diff mbox series

[RFC,3/3] sched/fair: Traverse cpufreq policies to detect capacity inversion

Message ID 20221127141742.1644023-4-qyousef@layalina.io
State New
Headers show
Series Fixes for uclamp and capacity inversion detection | expand

Commit Message

Qais Yousef Nov. 27, 2022, 2:17 p.m. UTC
We used performance domains to traverse the list of domains that share
the same cpufreq policy to detect when this domain is severely impacted
by thermal pressure to cause it to be lower than another domain in the
system - capacity inversion.

Since performance domains are only available for when energy model or
schedutil are present, this makes the detection mechanism unavailable
for Capacity Aware Scheduling (CAS).

Since we only care about traversing the capacity_orig() of any cpu
within that domain; export for_each_active_policy() to traverse the
cpufreq policies instead of performance domains.

Introduce a new for_each_active_policy_safe() to protect against races
with deletion. Races against additions are fine since we can't
eliminate the race without having to do heavy handed locking which is
unacceptable in this path. The policy should be visible in the next
tick if we missed it.

Fixes: 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
---

Rafael, Viresh, I hope it's okay to export these macros in the public header.
And that my usage is okay; I'm not sure if I missed important locking rules.


 drivers/cpufreq/cpufreq.c | 12 +-----------
 include/linux/cpufreq.h   | 26 ++++++++++++++++++++++++++
 kernel/sched/fair.c       | 13 +++++--------
 3 files changed, 32 insertions(+), 19 deletions(-)

Comments

Rafael J. Wysocki Nov. 30, 2022, 6:27 p.m. UTC | #1
On Sun, Nov 27, 2022 at 3:18 PM Qais Yousef <qyousef@layalina.io> wrote:
>
> We used performance domains to traverse the list of domains that share
> the same cpufreq policy to detect when this domain is severely impacted
> by thermal pressure to cause it to be lower than another domain in the
> system - capacity inversion.
>
> Since performance domains are only available for when energy model or
> schedutil are present, this makes the detection mechanism unavailable
> for Capacity Aware Scheduling (CAS).
>
> Since we only care about traversing the capacity_orig() of any cpu
> within that domain; export for_each_active_policy() to traverse the
> cpufreq policies instead of performance domains.
>
> Introduce a new for_each_active_policy_safe() to protect against races
> with deletion. Races against additions are fine since we can't
> eliminate the race without having to do heavy handed locking which is
> unacceptable in this path. The policy should be visible in the next
> tick if we missed it.
>
> Fixes: 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
> Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
> ---
>
> Rafael, Viresh, I hope it's okay to export these macros in the public header.
> And that my usage is okay; I'm not sure if I missed important locking rules.
>
>
>  drivers/cpufreq/cpufreq.c | 12 +-----------
>  include/linux/cpufreq.h   | 26 ++++++++++++++++++++++++++
>  kernel/sched/fair.c       | 13 +++++--------
>  3 files changed, 32 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 69b3d61852ac..b11e7c545fc1 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -31,17 +31,7 @@
>  #include <linux/units.h>
>  #include <trace/events/power.h>
>
> -static LIST_HEAD(cpufreq_policy_list);
> -
> -/* Macros to iterate over CPU policies */
> -#define for_each_suitable_policy(__policy, __active)                    \
> -       list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
> -               if ((__active) == !policy_is_inactive(__policy))
> -
> -#define for_each_active_policy(__policy)               \
> -       for_each_suitable_policy(__policy, true)
> -#define for_each_inactive_policy(__policy)             \
> -       for_each_suitable_policy(__policy, false)
> +LIST_HEAD(cpufreq_policy_list);
>
>  /* Iterate over governors */
>  static LIST_HEAD(cpufreq_governor_list);
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index d5595d57f4e5..c3c79d4ad821 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -780,6 +780,32 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
>                         continue;                                               \
>                 else
>
> +#ifdef CONFIG_CPU_FREQ
> +extern struct list_head cpufreq_policy_list;
> +
> +/* Macros to iterate over CPU policies */
> +#define for_each_suitable_policy(__policy, __active)                    \
> +       list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
> +               if ((__active) == !policy_is_inactive(__policy))
> +
> +#define for_each_suitable_policy_safe(__policy, __n, __active)                    \
> +       list_for_each_entry_safe(__policy, __n, &cpufreq_policy_list, policy_list) \
> +               if ((__active) == !policy_is_inactive(__policy))
> +#else
> +#define for_each_suitable_policy(__policy, __active)           while (0)
> +#define for_each_suitable_policy_safe(__policy, __n, __active) while (0)
> +#endif
> +
> +#define for_each_active_policy(__policy)               \
> +       for_each_suitable_policy(__policy, true)
> +#define for_each_inactive_policy(__policy)             \
> +       for_each_suitable_policy(__policy, false)
> +
> +#define for_each_active_policy_safe(__policy, __n)             \
> +       for_each_suitable_policy_safe(__policy, __n, true)
> +#define for_each_inactive_policy_safe(__policy, __n)           \
> +       for_each_suitable_policy_safe(__policy, __n, false)
> +
>
>  int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
>                                     struct cpufreq_frequency_table *table);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7c0dd57e562a..4bbbca85134b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
>          *   * Thermal pressure will impact all cpus in this perf domain
>          *     equally.
>          */
> -       if (sched_energy_enabled()) {
> +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
>                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
>
>                 rq->cpu_capacity_inverted = 0;
>
> -               SCHED_WARN_ON(!rcu_read_lock_held());
> -
> -               for (; pd; pd = pd->next) {
> -                       struct cpumask *pd_span = perf_domain_span(pd);
> +               for_each_active_policy_safe(policy, policy_n) {

1. Is the "safe" part sufficient for protection against concurrent
deletion and freeing of list entries?  cpufreq driver removal can do
that AFAICS.
2. For a casual reader of this code it may not be clear why cpufreq
policies matter here.

>                         unsigned long pd_cap_orig, pd_cap;
>
>                         /* We can't be inverted against our own pd */
> -                       if (cpumask_test_cpu(cpu_of(rq), pd_span))
> +                       if (cpumask_test_cpu(cpu_of(rq), policy->cpus))
>                                 continue;
>
> -                       cpu = cpumask_any(pd_span);
> +                       cpu = cpumask_any(policy->cpus);
>                         pd_cap_orig = arch_scale_cpu_capacity(cpu);
>
>                         if (capacity_orig < pd_cap_orig)
> --
> 2.25.1
>
Vincent Guittot Dec. 2, 2022, 2:57 p.m. UTC | #2
On Sun, 27 Nov 2022 at 15:18, Qais Yousef <qyousef@layalina.io> wrote:
>
> We used performance domains to traverse the list of domains that share
> the same cpufreq policy to detect when this domain is severely impacted
> by thermal pressure to cause it to be lower than another domain in the
> system - capacity inversion.
>
> Since performance domains are only available for when energy model or
> schedutil are present, this makes the detection mechanism unavailable
> for Capacity Aware Scheduling (CAS).
>
> Since we only care about traversing the capacity_orig() of any cpu
> within that domain; export for_each_active_policy() to traverse the
> cpufreq policies instead of performance domains.
>
> Introduce a new for_each_active_policy_safe() to protect against races
> with deletion. Races against additions are fine since we can't
> eliminate the race without having to do heavy handed locking which is
> unacceptable in this path. The policy should be visible in the next
> tick if we missed it.
>
> Fixes: 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
> Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
> ---
>
> Rafael, Viresh, I hope it's okay to export these macros in the public header.
> And that my usage is okay; I'm not sure if I missed important locking rules.
>
>
>  drivers/cpufreq/cpufreq.c | 12 +-----------
>  include/linux/cpufreq.h   | 26 ++++++++++++++++++++++++++
>  kernel/sched/fair.c       | 13 +++++--------
>  3 files changed, 32 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 69b3d61852ac..b11e7c545fc1 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -31,17 +31,7 @@
>  #include <linux/units.h>
>  #include <trace/events/power.h>
>
> -static LIST_HEAD(cpufreq_policy_list);
> -
> -/* Macros to iterate over CPU policies */
> -#define for_each_suitable_policy(__policy, __active)                    \
> -       list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
> -               if ((__active) == !policy_is_inactive(__policy))
> -
> -#define for_each_active_policy(__policy)               \
> -       for_each_suitable_policy(__policy, true)
> -#define for_each_inactive_policy(__policy)             \
> -       for_each_suitable_policy(__policy, false)
> +LIST_HEAD(cpufreq_policy_list);
>
>  /* Iterate over governors */
>  static LIST_HEAD(cpufreq_governor_list);
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index d5595d57f4e5..c3c79d4ad821 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -780,6 +780,32 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
>                         continue;                                               \
>                 else
>
> +#ifdef CONFIG_CPU_FREQ
> +extern struct list_head cpufreq_policy_list;
> +
> +/* Macros to iterate over CPU policies */
> +#define for_each_suitable_policy(__policy, __active)                    \
> +       list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
> +               if ((__active) == !policy_is_inactive(__policy))
> +
> +#define for_each_suitable_policy_safe(__policy, __n, __active)                    \
> +       list_for_each_entry_safe(__policy, __n, &cpufreq_policy_list, policy_list) \
> +               if ((__active) == !policy_is_inactive(__policy))
> +#else
> +#define for_each_suitable_policy(__policy, __active)           while (0)
> +#define for_each_suitable_policy_safe(__policy, __n, __active) while (0)
> +#endif
> +
> +#define for_each_active_policy(__policy)               \
> +       for_each_suitable_policy(__policy, true)
> +#define for_each_inactive_policy(__policy)             \
> +       for_each_suitable_policy(__policy, false)
> +
> +#define for_each_active_policy_safe(__policy, __n)             \
> +       for_each_suitable_policy_safe(__policy, __n, true)
> +#define for_each_inactive_policy_safe(__policy, __n)           \
> +       for_each_suitable_policy_safe(__policy, __n, false)
> +
>
>  int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
>                                     struct cpufreq_frequency_table *table);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7c0dd57e562a..4bbbca85134b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
>          *   * Thermal pressure will impact all cpus in this perf domain
>          *     equally.
>          */
> -       if (sched_energy_enabled()) {
> +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
>                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
>
>                 rq->cpu_capacity_inverted = 0;
>
> -               SCHED_WARN_ON(!rcu_read_lock_held());
> -
> -               for (; pd; pd = pd->next) {
> -                       struct cpumask *pd_span = perf_domain_span(pd);
> +               for_each_active_policy_safe(policy, policy_n) {

So you are looping all cpufreq policy (and before the perf domain) in
the period load balance. That' really not something we should or want
to do


>                         unsigned long pd_cap_orig, pd_cap;
>
>                         /* We can't be inverted against our own pd */
> -                       if (cpumask_test_cpu(cpu_of(rq), pd_span))
> +                       if (cpumask_test_cpu(cpu_of(rq), policy->cpus))
>                                 continue;
>
> -                       cpu = cpumask_any(pd_span);
> +                       cpu = cpumask_any(policy->cpus);
>                         pd_cap_orig = arch_scale_cpu_capacity(cpu);
>
>                         if (capacity_orig < pd_cap_orig)
> --
> 2.25.1
>
Qais Yousef Dec. 3, 2022, 2:32 p.m. UTC | #3
On 11/30/22 19:27, Rafael J. Wysocki wrote:

> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 7c0dd57e562a..4bbbca85134b 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> >          *   * Thermal pressure will impact all cpus in this perf domain
> >          *     equally.
> >          */
> > -       if (sched_energy_enabled()) {
> > +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> >                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> > +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
> >
> >                 rq->cpu_capacity_inverted = 0;
> >
> > -               SCHED_WARN_ON(!rcu_read_lock_held());
> > -
> > -               for (; pd; pd = pd->next) {
> > -                       struct cpumask *pd_span = perf_domain_span(pd);
> > +               for_each_active_policy_safe(policy, policy_n) {
> 
> 1. Is the "safe" part sufficient for protection against concurrent
> deletion and freeing of list entries?  cpufreq driver removal can do
> that AFAICS.

The freeing part is not safe probably. I need to research this more. Do you
have issues against the exportation of this traversal in principle?

Switching them to be RCU protected could be the best safe option, anything
against that too? I might not end up needing that. I need to dig more.

> 2. For a casual reader of this code it may not be clear why cpufreq
> policies matter here.

I'm looking for a way to traverse the list of capacities of the system and
know their related CPUs.

AFAICT this information already exists in the performance domains and
cpufreq_policy. Performance domains are conditional to energy model and
schedutil. So trying to switch to cpufreq_policy.

Assuming this question wasn't a request to add a comment :-)


Thanks!

--
Qais Yousef
Qais Yousef Dec. 3, 2022, 2:33 p.m. UTC | #4
On 12/02/22 15:57, Vincent Guittot wrote:

> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 7c0dd57e562a..4bbbca85134b 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> >          *   * Thermal pressure will impact all cpus in this perf domain
> >          *     equally.
> >          */
> > -       if (sched_energy_enabled()) {
> > +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> >                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> > +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
> >
> >                 rq->cpu_capacity_inverted = 0;
> >
> > -               SCHED_WARN_ON(!rcu_read_lock_held());
> > -
> > -               for (; pd; pd = pd->next) {
> > -                       struct cpumask *pd_span = perf_domain_span(pd);
> > +               for_each_active_policy_safe(policy, policy_n) {
> 
> So you are looping all cpufreq policy (and before the perf domain) in
> the period load balance. That' really not something we should or want
> to do

Why is it not acceptable in the period load balance but acceptable in the hot
wake up path in feec()? What's the difference?


Thanks!

--
Qais Yousef
Vincent Guittot Dec. 4, 2022, 11:35 a.m. UTC | #5
On Sat, 3 Dec 2022 at 15:33, Qais Yousef <qyousef@layalina.io> wrote:
>
> On 12/02/22 15:57, Vincent Guittot wrote:
>
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 7c0dd57e562a..4bbbca85134b 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > >          *   * Thermal pressure will impact all cpus in this perf domain
> > >          *     equally.
> > >          */
> > > -       if (sched_energy_enabled()) {
> > > +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> > >                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > > -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> > > +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
> > >
> > >                 rq->cpu_capacity_inverted = 0;
> > >
> > > -               SCHED_WARN_ON(!rcu_read_lock_held());
> > > -
> > > -               for (; pd; pd = pd->next) {
> > > -                       struct cpumask *pd_span = perf_domain_span(pd);
> > > +               for_each_active_policy_safe(policy, policy_n) {
> >
> > So you are looping all cpufreq policy (and before the perf domain) in
> > the period load balance. That' really not something we should or want
> > to do
>
> Why is it not acceptable in the period load balance but acceptable in the hot
> wake up path in feec()? What's the difference?

This patch loops on all cpufreq policy in sched softirq, how can this
be sane ? and not only in eas mode but also in the default asymmetric
performance  one.

This inverted detection doesn't look like the right way to fix your
problem IMO. That being said, i agree that I haven't made any other
proposal apart that I think that you should use a different rules for
task and for overutilized and part of your problem comes from this.

Then this make eas and util_fits_cpu even more Arm specific and I
still hope to merge sched_asym_cpucapacity and asym_packing a some
levels because they looks  more and more similar but each side is
trying to add some SoC specific policy

Vincent

>
>
> Thanks!
>
> --
> Qais Yousef
Qais Yousef Dec. 5, 2022, 11:01 a.m. UTC | #6
On 12/04/22 12:35, Vincent Guittot wrote:
> On Sat, 3 Dec 2022 at 15:33, Qais Yousef <qyousef@layalina.io> wrote:
> >
> > On 12/02/22 15:57, Vincent Guittot wrote:
> >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 7c0dd57e562a..4bbbca85134b 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > > >          *   * Thermal pressure will impact all cpus in this perf domain
> > > >          *     equally.
> > > >          */
> > > > -       if (sched_energy_enabled()) {
> > > > +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> > > >                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > > > -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> > > > +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
> > > >
> > > >                 rq->cpu_capacity_inverted = 0;
> > > >
> > > > -               SCHED_WARN_ON(!rcu_read_lock_held());
> > > > -
> > > > -               for (; pd; pd = pd->next) {
> > > > -                       struct cpumask *pd_span = perf_domain_span(pd);
> > > > +               for_each_active_policy_safe(policy, policy_n) {
> > >
> > > So you are looping all cpufreq policy (and before the perf domain) in
> > > the period load balance. That' really not something we should or want
> > > to do
> >
> > Why is it not acceptable in the period load balance but acceptable in the hot
> > wake up path in feec()? What's the difference?
> 
> This patch loops on all cpufreq policy in sched softirq, how can this
> be sane ? and not only in eas mode but also in the default asymmetric

Hmm I'm still puzzled. Why it's not sane to do it here but it's okay to do it
in the wake up path in feec()?

AFAICT the number of cpufreq policies and perf domains have 1:1 mapping. In
feec() we not only loop perf domains, but we go through each cpu of each domain
to find max_spare_cap then compute_energy(). Which is more intensive.

In worst case scenario where there's a perf domain for each cpu, then we'll
loop through every CPU *and* compute energy its energy cost in the wake up
path.

Why it's deemed acceptable in wake up path which is more critical AFAIU but not
period load balance which is expected to do more work? What am I missing?

> performance  one.
> 
> This inverted detection doesn't look like the right way to fix your
> problem IMO. That being said, i agree that I haven't made any other
> proposal apart that I think that you should use a different rules for
> task and for overutilized and part of your problem comes from this.

We discussed this before; I need to revisit the thread but I can't see how
overutilized different than task will fix the issue. They should be unified by
design.

I'm all ears if there's a simpler way to address the problem :-)

The problem is that thermal pressure on big cpu is not important from
uclamp perspective until it is in inversion state. It is quite common to have
a system where the medium capacity is in 500 range. If the big is under thermal
pressure that it drops to 800, then it is still a fitting CPU from uclamp
perspective. Keep in mind uclamp_min is useful for tasks whose utilization is
small So we need to be selective when thermal pressure is actually helping out
or just creating unnecessary problems.

The only other option I had in mind was to do the detection when we update the
thermal_pressure in the topology code. But that didn't look better alternative
to me.

> 
> Then this make eas and util_fits_cpu even more Arm specific and I

What is the Arm specific part about it? Why it wouldn't work on non-Arm
systems?

> still hope to merge sched_asym_cpucapacity and asym_packing a some
> levels because they looks  more and more similar but each side is
> trying to add some SoC specific policy

Oh, it seems Intel relies on asym_packing for their hybrid support approach?
I think sched_asym_cpucapacity was designed to be generic. If I gathered
correctly lack of support for SMT and inability to provide energy model outside
of DT were some required extensions.

To be honest, I personally think EAS can be useful on SMP systems and it would
be nice to enable it outside of sched_asym_cpucapacity.

I'm interested to hear more about this unification idea actually. If you feel
a bit chatty to describe in more detail how do you see this being unified, that
could be enlightening for some of us who work in this area :-)


Thanks!

--
Qais Yousef
Rafael J. Wysocki Dec. 5, 2022, 12:39 p.m. UTC | #7
On Sat, Dec 3, 2022 at 3:32 PM Qais Yousef <qyousef@layalina.io> wrote:
>
> On 11/30/22 19:27, Rafael J. Wysocki wrote:
>
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 7c0dd57e562a..4bbbca85134b 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > >          *   * Thermal pressure will impact all cpus in this perf domain
> > >          *     equally.
> > >          */
> > > -       if (sched_energy_enabled()) {
> > > +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> > >                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > > -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> > > +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
> > >
> > >                 rq->cpu_capacity_inverted = 0;
> > >
> > > -               SCHED_WARN_ON(!rcu_read_lock_held());
> > > -
> > > -               for (; pd; pd = pd->next) {
> > > -                       struct cpumask *pd_span = perf_domain_span(pd);
> > > +               for_each_active_policy_safe(policy, policy_n) {
> >
> > 1. Is the "safe" part sufficient for protection against concurrent
> > deletion and freeing of list entries?  cpufreq driver removal can do
> > that AFAICS.
>
> The freeing part is not safe probably.

Well, I don't even think that the traversal part is safe against
concurrent removal of list entries (it is safe against removal of list
entries in the loop itself).

list_for_each_entry_safe() assumes that n will always point to a valid
list entry, but what if the entry pointed to by it is deleted by a
concurrent thread?

> I need to research this more. Do you
> have issues against the exportation of this traversal in principle?
>
> Switching them to be RCU protected could be the best safe option, anything
> against that too?

Not really, it just occurred to me that you may end up dealing with a
corrupted list here.

> I might not end up needing that. I need to dig more.
>
> > 2. For a casual reader of this code it may not be clear why cpufreq
> > policies matter here.
>
> I'm looking for a way to traverse the list of capacities of the system and
> know their related CPUs.

So why don't you mention this in a comment, for instance?

> AFAICT this information already exists in the performance domains and
> cpufreq_policy. Performance domains are conditional to energy model and
> schedutil. So trying to switch to cpufreq_policy.
>
> Assuming this question wasn't a request to add a comment :-)

Yes, it was. :-)

That said though, this change makes the scheduler kind of depend on
cpufreq which feels a bit like a corner cut TBH.

I do realize that cpufreq happens to be maintaining a data structure
that turns out to be useful here, but the reason why it is there has
nothing to do with this code AFAICS.
Qais Yousef Dec. 5, 2022, 2:09 p.m. UTC | #8
On 12/05/22 13:39, Rafael J. Wysocki wrote:
> On Sat, Dec 3, 2022 at 3:32 PM Qais Yousef <qyousef@layalina.io> wrote:
> >
> > On 11/30/22 19:27, Rafael J. Wysocki wrote:
> >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 7c0dd57e562a..4bbbca85134b 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > > >          *   * Thermal pressure will impact all cpus in this perf domain
> > > >          *     equally.
> > > >          */
> > > > -       if (sched_energy_enabled()) {
> > > > +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> > > >                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > > > -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> > > > +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
> > > >
> > > >                 rq->cpu_capacity_inverted = 0;
> > > >
> > > > -               SCHED_WARN_ON(!rcu_read_lock_held());
> > > > -
> > > > -               for (; pd; pd = pd->next) {
> > > > -                       struct cpumask *pd_span = perf_domain_span(pd);
> > > > +               for_each_active_policy_safe(policy, policy_n) {
> > >
> > > 1. Is the "safe" part sufficient for protection against concurrent
> > > deletion and freeing of list entries?  cpufreq driver removal can do
> > > that AFAICS.
> >
> > The freeing part is not safe probably.
> 
> Well, I don't even think that the traversal part is safe against
> concurrent removal of list entries (it is safe against removal of list
> entries in the loop itself).
> 
> list_for_each_entry_safe() assumes that n will always point to a valid
> list entry, but what if the entry pointed to by it is deleted by a
> concurrent thread?

Yeah I was being too hopeful here that safe does magic.

> 
> > I need to research this more. Do you
> > have issues against the exportation of this traversal in principle?
> >
> > Switching them to be RCU protected could be the best safe option, anything
> > against that too?
> 
> Not really, it just occurred to me that you may end up dealing with a
> corrupted list here.

Yes.

> 
> > I might not end up needing that. I need to dig more.
> >
> > > 2. For a casual reader of this code it may not be clear why cpufreq
> > > policies matter here.
> >
> > I'm looking for a way to traverse the list of capacities of the system and
> > know their related CPUs.
> 
> So why don't you mention this in a comment, for instance?

Sorry my bad. I realize my questions could have been clearer now.

> 
> > AFAICT this information already exists in the performance domains and
> > cpufreq_policy. Performance domains are conditional to energy model and
> > schedutil. So trying to switch to cpufreq_policy.
> >
> > Assuming this question wasn't a request to add a comment :-)
> 
> Yes, it was. :-)
> 
> That said though, this change makes the scheduler kind of depend on
> cpufreq which feels a bit like a corner cut TBH.

I share your sentiment; hence my request for feedback before I spend more time
on this. And apologies for not being clearer about my questions.

The deps is soft and it already exist in the form that performance domains,
schedutil, thermal pressure are all interacting with cpufreq subsystem already.

I can add a better comment for sure; if this ends up being the right thing to
do.

> I do realize that cpufreq happens to be maintaining a data structure
> that turns out to be useful here, but the reason why it is there has
> nothing to do with this code AFAICS.

The comments about needing to check for performance domains to detect capacity
inversion is not clear? Sorry I realize again now that all of this might look
out of the blue for you.

The issue I'm dealing with is that I want to know when thermal pressure has
gone too far that the capacity of the performance domain now is lower than
another domain in the system; aka capacity inversion.

The issue at hand is that when using uclamp_min; we need to consider thermal
pressure. But not unconditionally as we have corner cases like this one:

	p->util_avg = 10%
	p->uclamp_min = 90%

	capacity_orig_of(big) = 100%
	capacity_orig_of(med) = 60%

the task wants to run near the top perf level of the system; but if we take
thermal pressure blindly then we will fail to place the task on the big core
even though it's the most performant core. For example if

	capacity_orig_of(big) - thermal_pressure = 80%

then from uclamp_min hint perspective, the big core is fine and it's the best
placement we can get. The task is not getting what it gets, but 80% is better
than going down to 60% or lower.

On the other hand if

	p->util_avg = 10%
	p->uclamp_min = 60%

The task can be placed on a medium or big core. But if medium core is under
thermal pressure; then we do have the option to place it on big; and if we
ignore thermal pressure the task could miss a deadline because of this
unnecessary bad placement decision.

By knowing when we are in capacity inversion; we can ignore thermal pressure
for big cores until we hit the inversion point since then we know that we do
have another higher performance point in the system to consider.

We do update the capacity of the CPUs in this code. Hence why I added this
extra logic to detect the inversion here. And for that; the only way I found to
do it is by going via perf domain, but Dietmar made me realize it's dependent
on energy_model; so I'm trying to use cpufreq policies instead. If cpufreq is
not present, then there's no thermal pressure signals too; and this whole loop
should be optimized out by the compiler.

Hope this helps to clarify and not add more confusion! This logic is used in
the scheduler when deciding whether a task fits the CPU based on its util_avg
and uclamp values.

I wanted to check if it's alright to make these macros visible in cpufreq.h and
use them here. And check with the right locking rules. It seems converting the
list to be rcu protected is the right way forward.

Assuming Vincent (or you) don't tell me there's actually a better way to handle
all of that that I missed :-)


Thanks!

--
Qais Yousef
Vincent Guittot Dec. 6, 2022, 6:12 p.m. UTC | #9
On Mon, 5 Dec 2022 at 12:02, Qais Yousef <qyousef@layalina.io> wrote:
>
> On 12/04/22 12:35, Vincent Guittot wrote:
> > On Sat, 3 Dec 2022 at 15:33, Qais Yousef <qyousef@layalina.io> wrote:
> > >
> > > On 12/02/22 15:57, Vincent Guittot wrote:
> > >
> > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > > index 7c0dd57e562a..4bbbca85134b 100644
> > > > > --- a/kernel/sched/fair.c
> > > > > +++ b/kernel/sched/fair.c
> > > > > @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > > > >          *   * Thermal pressure will impact all cpus in this perf domain
> > > > >          *     equally.
> > > > >          */
> > > > > -       if (sched_energy_enabled()) {
> > > > > +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> > > > >                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > > > > -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> > > > > +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
> > > > >
> > > > >                 rq->cpu_capacity_inverted = 0;
> > > > >
> > > > > -               SCHED_WARN_ON(!rcu_read_lock_held());
> > > > > -
> > > > > -               for (; pd; pd = pd->next) {
> > > > > -                       struct cpumask *pd_span = perf_domain_span(pd);
> > > > > +               for_each_active_policy_safe(policy, policy_n) {
> > > >
> > > > So you are looping all cpufreq policy (and before the perf domain) in
> > > > the period load balance. That' really not something we should or want
> > > > to do
> > >
> > > Why is it not acceptable in the period load balance but acceptable in the hot
> > > wake up path in feec()? What's the difference?
> >
> > This patch loops on all cpufreq policy in sched softirq, how can this
> > be sane ? and not only in eas mode but also in the default asymmetric
>
> Hmm I'm still puzzled. Why it's not sane to do it here but it's okay to do it
> in the wake up path in feec()?

feec() should be considered as an exception not as the default rule.
Thing like above which loops for_each on external subsystem should be
prevented and the fact that feec loops all PDs doesn't means that we
can put that everywhere else

>
> AFAICT the number of cpufreq policies and perf domains have 1:1 mapping. In
> feec() we not only loop perf domains, but we go through each cpu of each domain
> to find max_spare_cap then compute_energy(). Which is more intensive.
>
> In worst case scenario where there's a perf domain for each cpu, then we'll
> loop through every CPU *and* compute energy its energy cost in the wake up
> path.
>
> Why it's deemed acceptable in wake up path which is more critical AFAIU but not
> period load balance which is expected to do more work? What am I missing?

Things like util_fits_cpu is not an EAS only feature although it's the
only user for now and this must remain usable by another part of the
scheduler that wants to check if a workload can fit on a cpu if
needed. But adding not scalable behavior makes it not usable by
default for anything else than Android big.LITTLE with 8 CPUs and 2/3
PDs.

As a summary the rule is to be scalable and don't assume that a
for_each loop on another framework can be reasonable thus the above is
not.

The fact that feec is not scalable and assumes that a system will not
have more than 8 cores and 2/3 PDs is a problem IMO because it
prevents other systems from even considering using it.

Also as mentioned before, the above will be called as soon as
sched_asym_cpucapacity is enabled even if eas and feec() is not used
so you extend the problem outside feec().

>
> > performance  one.
> >
> > This inverted detection doesn't look like the right way to fix your
> > problem IMO. That being said, i agree that I haven't made any other
> > proposal apart that I think that you should use a different rules for
> > task and for overutilized and part of your problem comes from this.
>
> We discussed this before; I need to revisit the thread but I can't see how
> overutilized different than task will fix the issue. They should be unified by
> design.

A unified design doesn't mean one function especially if that includes
to loop for_each_active_policy_safe() in sched softirq

uclamp_min should not be used to set an over utilized cpu  because it
doesn means that the cpu is overutilized and uclamp_max should be used
for not setting overutilized a cpu with a clamped max value

>
> I'm all ears if there's a simpler way to address the problem :-)

let me try to prepare a something to show what I mean

>
> The problem is that thermal pressure on big cpu is not important from
> uclamp perspective until it is in inversion state. It is quite common to have
> a system where the medium capacity is in 500 range. If the big is under thermal
> pressure that it drops to 800, then it is still a fitting CPU from uclamp
> perspective. Keep in mind uclamp_min is useful for tasks whose utilization is
> small So we need to be selective when thermal pressure is actually helping out
> or just creating unnecessary problems.

What about medium cores ?  There are often 3 capacity levels now so
the same can happen between medium and little.

Also we have more and more thermal capping happening ahead of
overheating like the power cap interface where we want to cap some
CPUs and others even before they reach their thermal limit in order to
get more power/thermal room for others. This implies that some CPUs
could be under thermal pressure whereas others not. This implies also
that a UC with big core capacity being below little and medium cores
is not impossible or medium being under little but big staying at high
capacity.

>
> The only other option I had in mind was to do the detection when we update the
> thermal_pressure in the topology code. But that didn't look better alternative
> to me.
>
> >
> > Then this make eas and util_fits_cpu even more Arm specific and I
>
> What is the Arm specific part about it? Why it wouldn't work on non-Arm
> systems?

Because it assume that for_each_cpufreq loop in not a problem mostly
because there will be no more than 3 CPUs and 2/3 PDs (ie Arm based
Android smartphone) and as a result, it's acceptable to loop
everything everytime

>
> > still hope to merge sched_asym_cpucapacity and asym_packing a some
> > levels because they looks  more and more similar but each side is
> > trying to add some SoC specific policy
>
> Oh, it seems Intel relies on asym_packing for their hybrid support approach?
> I think sched_asym_cpucapacity was designed to be generic. If I gathered
> correctly lack of support for SMT and inability to provide energy model outside
> of DT were some required extensions.

At least merging asym_packing and sched_asym_cpucapacity would be a
good starting point even if we don't consider using feec() in the 1st
step.

When we have things like below in the code in find_busiest_queue(), it
probably means that we have duplicated behavior :

if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
    !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
    nr_running == 1)
continue;

/* Make sure we only pull tasks from a CPU of lower priority */
if ((env->sd->flags & SD_ASYM_PACKING) &&
    sched_asym_prefer(i, env->dst_cpu) &&
    nr_running == 1)
continue;


They both try to find the cpu with highest capacity that will fit the
task requirement

>
> To be honest, I personally think EAS can be useful on SMP systems and it would
> be nice to enable it outside of sched_asym_cpucapacity.

or even before that task_fits_cpu should probably be useful for some smp case

>
> I'm interested to hear more about this unification idea actually. If you feel
> a bit chatty to describe in more detail how do you see this being unified, that
> could be enlightening for some of us who work in this area :-)

First, keep everything scalable and don't loop on all cpufreq policy
or anything else that can't scale so we don't have to take care of the
number of cpu of cpufreq policy in the system
Then, I think that a good starting point would be to merge the
behavior of SD_ASYM_CPUCAPACITY and SD_ASYM_PACKING instead of 2
parallel and similar behaviors
The misfit task could also be unified to integrate other things than
EAS like the IPC that ricardo is working one. I haven't look in
details how this could be doable


>
>
> Thanks!

>
> --
> Qais Yousef
Qais Yousef Dec. 8, 2022, 2:05 p.m. UTC | #10
On 12/06/22 19:12, Vincent Guittot wrote:
> On Mon, 5 Dec 2022 at 12:02, Qais Yousef <qyousef@layalina.io> wrote:
> >
> > On 12/04/22 12:35, Vincent Guittot wrote:
> > > On Sat, 3 Dec 2022 at 15:33, Qais Yousef <qyousef@layalina.io> wrote:
> > > >
> > > > On 12/02/22 15:57, Vincent Guittot wrote:
> > > >
> > > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > > > index 7c0dd57e562a..4bbbca85134b 100644
> > > > > > --- a/kernel/sched/fair.c
> > > > > > +++ b/kernel/sched/fair.c
> > > > > > @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > > > > >          *   * Thermal pressure will impact all cpus in this perf domain
> > > > > >          *     equally.
> > > > > >          */
> > > > > > -       if (sched_energy_enabled()) {
> > > > > > +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> > > > > >                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> > > > > > -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> > > > > > +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
> > > > > >
> > > > > >                 rq->cpu_capacity_inverted = 0;
> > > > > >
> > > > > > -               SCHED_WARN_ON(!rcu_read_lock_held());
> > > > > > -
> > > > > > -               for (; pd; pd = pd->next) {
> > > > > > -                       struct cpumask *pd_span = perf_domain_span(pd);
> > > > > > +               for_each_active_policy_safe(policy, policy_n) {
> > > > >
> > > > > So you are looping all cpufreq policy (and before the perf domain) in
> > > > > the period load balance. That' really not something we should or want
> > > > > to do
> > > >
> > > > Why is it not acceptable in the period load balance but acceptable in the hot
> > > > wake up path in feec()? What's the difference?
> > >
> > > This patch loops on all cpufreq policy in sched softirq, how can this
> > > be sane ? and not only in eas mode but also in the default asymmetric
> >
> > Hmm I'm still puzzled. Why it's not sane to do it here but it's okay to do it
> > in the wake up path in feec()?
> 
> feec() should be considered as an exception not as the default rule.
> Thing like above which loops for_each on external subsystem should be
> prevented and the fact that feec loops all PDs doesn't means that we
> can put that everywhere else

Fair enough. But really understanding the root cause behind this limitation
will be very helpful. I don't have the same appreciation of why this is
a problem, and shedding more light will help me to think more about it in the
future.

I have a pinebook pro that contains 4 littles and 2 bigs. What kind of
experiment I can run to see the impact of a large loop here? Is trying with 32
or even 64 policies good? What type of workloads to try this with?

> 
> >
> > AFAICT the number of cpufreq policies and perf domains have 1:1 mapping. In
> > feec() we not only loop perf domains, but we go through each cpu of each domain
> > to find max_spare_cap then compute_energy(). Which is more intensive.
> >
> > In worst case scenario where there's a perf domain for each cpu, then we'll
> > loop through every CPU *and* compute energy its energy cost in the wake up
> > path.
> >
> > Why it's deemed acceptable in wake up path which is more critical AFAIU but not
> > period load balance which is expected to do more work? What am I missing?
> 
> Things like util_fits_cpu is not an EAS only feature although it's the
> only user for now and this must remain usable by another part of the
> scheduler that wants to check if a workload can fit on a cpu if
> needed. But adding not scalable behavior makes it not usable by
> default for anything else than Android big.LITTLE with 8 CPUs and 2/3
> PDs.

To be honest I don't see the scalability problem, and that's the problem :-)

As I said before; I'm not against moving to something else. I didn't think
scalability is a problem even outside 8 CPUs and 2/3 PDs.

> 
> As a summary the rule is to be scalable and don't assume that a
> for_each loop on another framework can be reasonable thus the above is
> not.
> 
> The fact that feec is not scalable and assumes that a system will not
> have more than 8 cores and 2/3 PDs is a problem IMO because it
> prevents other systems from even considering using it.
> 
> Also as mentioned before, the above will be called as soon as
> sched_asym_cpucapacity is enabled even if eas and feec() is not used
> so you extend the problem outside feec().

I am even more curious now to understand where we hit a limit.

The only assumption I'm maybe making here; and I think is inherit in these
type of systems, is that if your hardware is HMP, then power is concern for you
and not only performance.

That said; as I mentioned in my previous reply; I did suggest an alternative
and happy to consider something else. I think I can do something in topology to
help us making an easier decision here without making both sides ugly. It just
seemed unnecessary at the time given the simplicity of this approach.

> 
> >
> > > performance  one.
> > >
> > > This inverted detection doesn't look like the right way to fix your
> > > problem IMO. That being said, i agree that I haven't made any other
> > > proposal apart that I think that you should use a different rules for
> > > task and for overutilized and part of your problem comes from this.
> >
> > We discussed this before; I need to revisit the thread but I can't see how
> > overutilized different than task will fix the issue. They should be unified by
> > design.
> 
> A unified design doesn't mean one function especially if that includes
> to loop for_each_active_policy_safe() in sched softirq

This only makes sense to me if you refer to death by thousand cuts kind of
problem. I can't see how this on its own being a problem.. I'll run some tests
to convince myself.

> 
> uclamp_min should not be used to set an over utilized cpu  because it

uclamp_min must set overutilized. If a long running task has its uclamp_min
changed, we want to upmigrate it if that's necessary to meet the new demand.

Keep in mind in android tasks could move between top-app, foreground and
background groups.

And keep in mind Android now implement a framework to dynamically change
uclamp. It's available from Android 12 as part of Android Dynamic Performance
Framework (ADPF). It's under CPU Hints.

	https://developer.android.com/games/optimize/adpf

> doesn means that the cpu is overutilized and uclamp_max should be used

It is a misfit task; which requires overutilized to be set to re-enable load
balance for uclamp_min to upgrate it. For uclamp max we should not set
overutilized, agreed and that's what we should be doing.

> for not setting overutilized a cpu with a clamped max value
> 
> >
> > I'm all ears if there's a simpler way to address the problem :-)
> 
> let me try to prepare a something to show what I mean

A PoC just to help me see what you mean would be great, thanks!

> 
> >
> > The problem is that thermal pressure on big cpu is not important from
> > uclamp perspective until it is in inversion state. It is quite common to have
> > a system where the medium capacity is in 500 range. If the big is under thermal
> > pressure that it drops to 800, then it is still a fitting CPU from uclamp
> > perspective. Keep in mind uclamp_min is useful for tasks whose utilization is
> > small So we need to be selective when thermal pressure is actually helping out
> > or just creating unnecessary problems.
> 
> What about medium cores ?  There are often 3 capacity levels now so
> the same can happen between medium and little.

We do subtract thermal pressure from capacity for the mediums and littles all
the time. So that case is handled, no?

> 
> Also we have more and more thermal capping happening ahead of
> overheating like the power cap interface where we want to cap some
> CPUs and others even before they reach their thermal limit in order to
> get more power/thermal room for others. This implies that some CPUs
> could be under thermal pressure whereas others not. This implies also
> that a UC with big core capacity being below little and medium cores
> is not impossible or medium being under little but big staying at high
> capacity.

I *think* I am catering for this already. The big is special because it's the
highest level, so thermal pressure; from uclamp perspective, is not meaningful
as it's the best you can ever get on this system; untill it becomes capacity
inverted that is.

Getting this not to break easily is really important for the successful
adoption of this hint.

> 
> >
> > The only other option I had in mind was to do the detection when we update the
> > thermal_pressure in the topology code. But that didn't look better alternative
> > to me.
> >
> > >
> > > Then this make eas and util_fits_cpu even more Arm specific and I
> >
> > What is the Arm specific part about it? Why it wouldn't work on non-Arm
> > systems?
> 
> Because it assume that for_each_cpufreq loop in not a problem mostly
> because there will be no more than 3 CPUs and 2/3 PDs (ie Arm based
> Android smartphone) and as a result, it's acceptable to loop
> everything everytime

I did not make this assumption. I genuinely thought (and still think to be
honest) that it's not a scalability issue. But of course I appreciate your
knowledge and experience when you say you think this can be a problem.

I only experienced the issue of too many cgroups causing newidle_cpu() to take
a long time; if you remember that issue we had with Joel.

If any assumptions I made here it is that someone creating HMP systems it is
not for a massive server market where we can have 100s of cores. I probably
have assumed HMP = average consumer market that wants power efficient devices
and if we go crazy we can go to 32 cpus or something. Which I didn't think will
be a problem even if each cpus has its own PD.

> 
> >
> > > still hope to merge sched_asym_cpucapacity and asym_packing a some
> > > levels because they looks  more and more similar but each side is
> > > trying to add some SoC specific policy
> >
> > Oh, it seems Intel relies on asym_packing for their hybrid support approach?
> > I think sched_asym_cpucapacity was designed to be generic. If I gathered
> > correctly lack of support for SMT and inability to provide energy model outside
> > of DT were some required extensions.
> 
> At least merging asym_packing and sched_asym_cpucapacity would be a
> good starting point even if we don't consider using feec() in the 1st
> step.
> 
> When we have things like below in the code in find_busiest_queue(), it
> probably means that we have duplicated behavior :
> 
> if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
>     !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
>     nr_running == 1)
> continue;
> 
> /* Make sure we only pull tasks from a CPU of lower priority */
> if ((env->sd->flags & SD_ASYM_PACKING) &&
>     sched_asym_prefer(i, env->dst_cpu) &&
>     nr_running == 1)
> continue;

I see.

Just to make sure, this is only used in LB only, right?

I do have a problem that might make this more complicated.

I'm considering if we can enable packing in load_balance by using feec().
Limiting the search space to only idle CPUs causes problems. And these problems
relate to both performance and power.

For example, If a small task is running on a big CPU, it will prevent big-ish
tasks from migrating there on load balance if the only idle cpu is a little
one.

And if a small task moves to a big cpu on load balance because it's the only
idle one, it could take the cluster out of deep sleep while it could have
easily crammed on a little core.

I can't say I've debugged these problems. But we do get lots of issues related
to big cpus waking up unnecessarily (it could be per-cpu tasks too); and task
not migrating fast enough.

I think the load balance behavior of going with idle cpus first is
contributing. I hope to dig more into this problem in the near future.

Can we consider these problems along the way too?

> They both try to find the cpu with highest capacity that will fit the
> task requirement

find the *idl* cpu with highest capacity, right? IIUC, we try to find idle cpu
then use load if all is busy? Sorry I don't have enough of the load balance
code in my head yet..

> 
> >
> > To be honest, I personally think EAS can be useful on SMP systems and it would
> > be nice to enable it outside of sched_asym_cpucapacity.
> 
> or even before that task_fits_cpu should probably be useful for some smp case

I think that should we doable even with the current approach if inversion
detection is not actually the problem and just the way we do it.

I chose the current approach because it seemed the simpler to me. I can do
something in the topology code to help make the inversion detection much easier
and more generic. But I'll hold on a bit in case you have a thought on another
way to tackle the problem.

We do have actually a similar problem for RT that I need to tackle too. So
maybe moving it to topology will help both CFS and RT.

> 
> >
> > I'm interested to hear more about this unification idea actually. If you feel
> > a bit chatty to describe in more detail how do you see this being unified, that
> > could be enlightening for some of us who work in this area :-)
> 
> First, keep everything scalable and don't loop on all cpufreq policy
> or anything else that can't scale so we don't have to take care of the
> number of cpu of cpufreq policy in the system
> Then, I think that a good starting point would be to merge the

If your issue just with that, and not inversion detection in particular,
I think I can find an alternative approach. Though for education purposes I'd
like to make sure to understand how/when this scalability problem manifests.

> behavior of SD_ASYM_CPUCAPACITY and SD_ASYM_PACKING instead of 2
> parallel and similar behaviors

To be clear, you mean load balance path right? If yes, then I'd like us to
consider how we can cater for some of the potential issues I mentioned above.

> The misfit task could also be unified to integrate other things than
> EAS like the IPC that ricardo is working one. I haven't look in
> details how this could be doable

To make things more complicated, to increase the effectiveness of uclamp_max,
we need a 'reverse misfit' behavior (terminology credit to Morten).

If a busy long running task has its uclamp_max changed such that we can put it
on a more energy efficient core, we want to extend misfit to handle down
migrations too. We relied on tasks going to sleep and feec() automatically
doing the right thing next wake up. With uclamp_max the next wake up might not
happen for a while.


Thanks!!

--
Qais Yousef
Qais Yousef Dec. 12, 2022, 6:43 p.m. UTC | #11
On 12/09/22 17:47, Vincent Guittot wrote:

[...]

> > > > > This patch loops on all cpufreq policy in sched softirq, how can this
> > > > > be sane ? and not only in eas mode but also in the default asymmetric
> > > >
> > > > Hmm I'm still puzzled. Why it's not sane to do it here but it's okay to do it
> > > > in the wake up path in feec()?
> > > 
> > > feec() should be considered as an exception not as the default rule.
> > > Thing like above which loops for_each on external subsystem should be
> > > prevented and the fact that feec loops all PDs doesn't means that we
> > > can put that everywhere else
> > 
> > Fair enough. But really understanding the root cause behind this limitation
> > will be very helpful. I don't have the same appreciation of why this is
> > a problem, and shedding more light will help me to think more about it in the
> > future.
> > 
> 
> Take the example of 1k cores with per cpu policy. Do you really think a
> for_each_cpufreq_policy would be reasonable ?

Hmm I don't think such an HMP system makes sense to ever exist.

That system has to be a multi-socket system and I doubt inversion detection is
something of value.

Point taken anyway. Let's find another way to do this.

[...]

> > This only makes sense to me if you refer to death by thousand cuts kind of
> > problem. I can't see how this on its own being a problem.. I'll run some tests
> > to convince myself.
> 
> That's exactly the point, the scheduler tries hard to not add any system size
> related limitation like: it's fine because it's only for 8 cores system.
> I don't want to lock up the uclamp/util_fits_cpu in such limitations so you
> can't use for_each_cpufreq or anything similar at runtime in sched softirq

I'm happy to make things more generic so more users can benefit :-)

> 
> > 
> > > 
> > > uclamp_min should not be used to set an over utilized cpu  because it
> > 
> > uclamp_min must set overutilized. If a long running task has its uclamp_min
> > changed, we want to upmigrate it if that's necessary to meet the new demand.
> 
> I think that you are too much focused on your 8 cores android system.

I am not; you're making an assumption here :-)

HMP systems for 1k servers just don't make any sense. A desktop with 128 or
even 256 HMP cores is a big stretch; and if that exist I don't think there's an
overhead to worry about here; and I *did* consider this. I measured the impact
if we have 128 and it was mere 1 or 2 us extra.  And that's on under powered
pine book pro. If such a system exist it'd probably be more performant.

> uclamp_min must not set a CPU overutilized because the CPU is not overutilized
> in this case. It's only the task that is misfit. You mostly try to bias some
> behavior to fit your use case.

Maybe we are talking about different things over here. As long as we agree it's
a misfit task then we are aligned.

As far as I know misfit required overutilized to re-enable load balance. But
maybe there's a detail that's creating this confusion.

> 
> > 
> > Keep in mind in android tasks could move between top-app, foreground and
> > background groups.
> > 
> > And keep in mind Android now implement a framework to dynamically change
> > uclamp. It's available from Android 12 as part of Android Dynamic Performance
> > Framework (ADPF). It's under CPU Hints.
> > 
> > 	https://developer.android.com/games/optimize/adpf
> > 
> 
> I have never put any kind of limitation of the task or system behavior

Nope you didn't. I was just trying to highlight some additional problems we're
seeing that we'd need to consider :-)

> 
> > > doesn means that the cpu is overutilized and uclamp_max should be used
> > 
> > It is a misfit task; which requires overutilized to be set to re-enable load
> > balance for uclamp_min to upgrate it. For uclamp max we should not set
> > overutilized, agreed and that's what we should be doing.
> 
> That's probably the root of your problem here. The load balance is still
> periodically called even when EAS is enabled but the latter prevents task
> migration unless overutilized in order to not break the energy aware task

Okay. For me this means load_balance is disabled since it's effectively doing
nothing. So maybe it's a terminology problem of what I meant with load balance
is disabled.

> placement. But if a task is misplaced and a cpu finds it can help,
> we should let it pull the task without disabling EAS. This will not enable the
> performance spread behavior and we can expect the other small tasks to be
> packed by EAS on the best cpu at next wakeup.
> 
> So intead of trying to detect a very specific capacity inversion use case
> during this periodic this load balance, it's better to let a CPU that can
> fix the misfit situation to pull the task.

I can't see the relation here.

Capacity inversion is required NOT in load balance. In every place will look
for a fitting CPU; we need to ensure hints to place the task on big cpu works
even under thermal pressure.

If uclamp_min = 1024, but big core has 5% thermal pressure, it should still fit
there. And feec() a long with all other callers should respect that too.

How better overutilized detection helps here? EAS needs to consider this as
a candidate CPU and place it there at wake up.

> 
> See below
> 
> > 
> > > for not setting overutilized a cpu with a clamped max value
> > > 
> > > >
> > > > I'm all ears if there's a simpler way to address the problem :-)
> > > 
> > > let me try to prepare a something to show what I mean
> > 
> > A PoC just to help me see what you mean would be great, thanks!
> > 
> 
> I have reverted patches:
> Revert "sched/fair: Detect capacity inversion"
> Revert "sched/fair: Consider capacity inversion in util_fits_cpu()"
> Revert "sched/uclamp: Make cpu_overutilized() use util_fits_cpu()"
> 
> ---
>  kernel/sched/fair.c | 32 +++++++++++++++++---------------
>  1 file changed, 17 insertions(+), 15 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4423681baf15..6e54afc0a6ec 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4539,7 +4539,7 @@ static inline int util_fits_cpu(unsigned long util,
>  	 * the time.
>  	 */
>  	capacity_orig = capacity_orig_of(cpu);
> -	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> +	capacity_orig_thermal = capacity_orig - thermal_load_avg(cpu_rq(cpu));
> 
>  	/*
>  	 * We want to force a task to fit a cpu as implied by uclamp_max.
> @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
>  	 *     2. The system is being saturated when we're operating near
>  	 *        max capacity, it doesn't make sense to block overutilized.
>  	 */
> -	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> -	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> +	uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
>  	fits = fits || uclamp_max_fits;
> 
>  	/*
> @@ -4614,7 +4613,7 @@ static inline int util_fits_cpu(unsigned long util,
>  	 * handle the case uclamp_min > uclamp_max.
>  	 */
>  	uclamp_min = min(uclamp_min, uclamp_max);
> -	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> +	if (util < uclamp_min)
>  		fits = fits && (uclamp_min <= capacity_orig_thermal);
> 
>  	return fits;
> @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
>  #ifdef CONFIG_SMP
>  static inline bool cpu_overutilized(int cpu)
>  {
> -	return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
> +	unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> +
> +	return !(fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)) ||
> +		 rq_util_max <= capacity_orig_of(cpu));
>  }
> 
>  static inline void update_overutilized_status(struct rq *rq)
> @@ -10164,24 +10166,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>  	 */
>  	update_sd_lb_stats(env, &sds);
> 
> -	if (sched_energy_enabled()) {
> -		struct root_domain *rd = env->dst_rq->rd;
> -
> -		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> -			goto out_balanced;
> -	}
> -
> -	local = &sds.local_stat;
> -	busiest = &sds.busiest_stat;
> -
>  	/* There is no busy sibling group to pull tasks from */
>  	if (!sds.busiest)
>  		goto out_balanced;
> 
> +	busiest = &sds.busiest_stat;
> +
>  	/* Misfit tasks should be dealt with regardless of the avg load */
>  	if (busiest->group_type == group_misfit_task)
>  		goto force_balance;
> 
> +	if (sched_energy_enabled()) {
> +		struct root_domain *rd = env->dst_rq->rd;
> +
> +		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> +			goto out_balanced;
> +	}
> +
>  	/* ASYM feature bypasses nice load balance check */
>  	if (busiest->group_type == group_asym_packing)
>  		goto force_balance;
> @@ -10194,6 +10195,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>  	if (busiest->group_type == group_imbalanced)
>  		goto force_balance;
> 
> +	local = &sds.local_stat;
>  	/*
>  	 * If the local group is busier than the selected busiest group
>  	 * don't try and pull any tasks.
> --
> 2.17.1
> 
> There are probably some other changes to do here and here like check_misfit_status
> but this enable another cpu with more capacity to pull a misfit task without
> disabling EAS.

I need time to digest this. But so far I see this is still irrelevant as the
problem with the definition of what *fits* a CPU and what doesn't.

On a system where:

	LITTLE=160, MEDIUM=480, BIG=1024

A task

	p->util_avg = 200
	p->util_min = 1024

Should always be placed on a big core unless the big core is capacity inverted.
That means most importantly in feec().

How does the above fix that?

This decision should be taken coherently in feec(), select_idle_capacity() and
any place that wants to check if the task fits the cpu or not.

I'm not sure why you're isolating this to the overutilized. It is just one of
the call sites that needs to check for fitness when uclamp is being used.

> 
> > > 
> > > >
> > > > The problem is that thermal pressure on big cpu is not important from
> > > > uclamp perspective until it is in inversion state. It is quite common to have
> > > > a system where the medium capacity is in 500 range. If the big is under thermal
> > > > pressure that it drops to 800, then it is still a fitting CPU from uclamp
> > > > perspective. Keep in mind uclamp_min is useful for tasks whose utilization is
> > > > small So we need to be selective when thermal pressure is actually helping out
> > > > or just creating unnecessary problems.
> > > 
> > > What about medium cores ?  There are often 3 capacity levels now so
> > > the same can happen between medium and little.
> > 
> > We do subtract thermal pressure from capacity for the mediums and littles all
> > the time. So that case is handled, no?
> 
> you don't need iversion detection for medium vs little ? Why it's acceptable here
> and not between bg vs medium

Because it's the highest level. You have no option to do better. uclamp_min is
a hint; if the util_avg is small (ie: it still fits_capacity()); best effort is
to keep giving the task the performance it needs. While on a big core; you
can't upmigrate to any other level; so thermal pressure on big core is
meaningless until it is in capacity inversion.

If the user asks for something and finds that their task is placed randomly
that's not good, no?

What you say makes sense against util_avg; the rule of which is still the same
as before.

> 
> >>
> > 
> > > 
> > > Also we have more and more thermal capping happening ahead of
> > > overheating like the power cap interface where we want to cap some
> > > CPUs and others even before they reach their thermal limit in order to
> > > get more power/thermal room for others. This implies that some CPUs
> > > could be under thermal pressure whereas others not. This implies also
> > > that a UC with big core capacity being below little and medium cores
> > > is not impossible or medium being under little but big staying at high
> > > capacity.
> > 
> > I *think* I am catering for this already. The big is special because it's the
> > highest level, so thermal pressure; from uclamp perspective, is not meaningful
> > as it's the best you can ever get on this system; untill it becomes capacity
> > inverted that is.
> > 
> 
> That's another point that concerns me, you put some special condition on big
> cores to be able to detect an possible inversion.
> If a task deosn't fit a cpu because of thermal pressure, it should be tagged misfit
> and let other cpu looks if they can do better. That's doesnt mean that we
> will find one.

This breaks the usefulness of the hint. What you say means a small task that
wants to run as fast as possible will find itself placed randomly instead
whenever there's thermal pressure. And userspace should just figure this out
on their own somehow that thermal pressure affects them and they start to find
a way to read the thermal pressure and set the right uclamp_min value??

As I tried to highlight; this will be set dynamically based on perceived
performance. Even for a static configuration; I don't think a hint of
uclamp_min = 1024 should fail that easily because some cores can be very big
and always have a residual 1 or 2 % thermal pressure.

[...]

> > 
> > I see.
> > 
> > Just to make sure, this is only used in LB only, right?
> 
> Yes, although I haven't check the IPC patchset
> 
> > 
> > I do have a problem that might make this more complicated.
> > 
> > I'm considering if we can enable packing in load_balance by using feec().
> > Limiting the search space to only idle CPUs causes problems. And these problems
> > relate to both performance and power.
> > 
> > For example, If a small task is running on a big CPU, it will prevent big-ish
> > tasks from migrating there on load balance if the only idle cpu is a little
> > one.
> > 
> > And if a small task moves to a big cpu on load balance because it's the only
> > idle one, it could take the cluster out of deep sleep while it could have
> 
> Maybe your root problem is that the small task migrates on a big cpu only
> because it's the only idle one and not because of utilization (uclamp_min/max
> included)

Yes I think that's the reason too and that's why I'm suggesting maybe we need
to consider packing for systems that care about power.

> 
> > easily crammed on a little core.
> > 
> > I can't say I've debugged these problems. But we do get lots of issues related
> > to big cpus waking up unnecessarily (it could be per-cpu tasks too); and task
> > not migrating fast enough.
> > 
> > I think the load balance behavior of going with idle cpus first is
> > contributing. I hope to dig more into this problem in the near future.
> 
> the load balance is not going with idle cpu 1st, the load balance happens on
> all cpus but the period of busy cpus is higher to not impact much the running
> load whereas idle cpus don't have anything else to do except lokking for work
> to pull and saving power

Okay the impact is the same. We pick idle cpus first; and
select_idle_capacity() which is what we should go through for HMP only search
for CPUs that are idle.

> 
> > 
> > Can we consider these problems along the way too?
> > 
> > > They both try to find the cpu with highest capacity that will fit the
> > > task requirement
> > 
> > find the *idl* cpu with highest capacity, right? IIUC, we try to find idle cpu
> > then use load if all is busy? Sorry I don't have enough of the load balance
> > code in my head yet..
> > 
> > > 
> > > >
> > > > To be honest, I personally think EAS can be useful on SMP systems and it would
> > > > be nice to enable it outside of sched_asym_cpucapacity.
> > > 
> > > or even before that task_fits_cpu should probably be useful for some smp case
> > 
> > I think that should we doable even with the current approach if inversion
> > detection is not actually the problem and just the way we do it.
> > 
> > I chose the current approach because it seemed the simpler to me. I can do
> > something in the topology code to help make the inversion detection much easier
> > and more generic. But I'll hold on a bit in case you have a thought on another
> > way to tackle the problem.
> > 
> > We do have actually a similar problem for RT that I need to tackle too. So
> > maybe moving it to topology will help both CFS and RT.
> > 
> > > 
> > > >
> > > > I'm interested to hear more about this unification idea actually. If you feel
> > > > a bit chatty to describe in more detail how do you see this being unified, that
> > > > could be enlightening for some of us who work in this area :-)
> > > 
> > > First, keep everything scalable and don't loop on all cpufreq policy
> > > or anything else that can't scale so we don't have to take care of the
> > > number of cpu of cpufreq policy in the system
> > > Then, I think that a good starting point would be to merge the
> > 
> > If your issue just with that, and not inversion detection in particular,
> 
> Looping on all cpufreq policies is the main problem but i also think that capacity
> inversion is specific to your system and can be replaced by a generic solution
> that will not make any assumption on any CPUs big vs little or 
> shared vs per cpu dvfs

It is not specific to 'my' system. It is a generic problem to any HMP based
system. I can understand issues with the implementation. But I can't
understand how you think taking thermal pressure unconditionally isn't
a problem. This unconditional consideration of thermal pressure is my problem.


Cheers

--
Qais Yousef

> 
> 
> > I think I can find an alternative approach. Though for education purposes I'd
> > like to make sure to understand how/when this scalability problem manifests.
> > 
> > > behavior of SD_ASYM_CPUCAPACITY and SD_ASYM_PACKING instead of 2
> > > parallel and similar behaviors
> > 
> > To be clear, you mean load balance path right? If yes, then I'd like us to
> > consider how we can cater for some of the potential issues I mentioned above.
> > 
> > > The misfit task could also be unified to integrate other things than
> > > EAS like the IPC that ricardo is working one. I haven't look in
> > > details how this could be doable
> > 
> > To make things more complicated, to increase the effectiveness of uclamp_max,
> > we need a 'reverse misfit' behavior (terminology credit to Morten).
> > 
> > If a busy long running task has its uclamp_max changed such that we can put it
> > on a more energy efficient core, we want to extend misfit to handle down
> > migrations too. We relied on tasks going to sleep and feec() automatically
> > doing the right thing next wake up. With uclamp_max the next wake up might not
> > happen for a while.
> > 
> > 
> > Thanks!!
> > 
> > --
> > Qais Yousef
Dietmar Eggemann Dec. 13, 2022, 5:38 p.m. UTC | #12
On 12/12/2022 19:43, Qais Yousef wrote:
> On 12/09/22 17:47, Vincent Guittot wrote:

[...]

> HMP systems for 1k servers just don't make any sense. A desktop with 128 or
> even 256 HMP cores is a big stretch; and if that exist I don't think there's an
> overhead to worry about here; and I *did* consider this. I measured the impact
> if we have 128 and it was mere 1 or 2 us extra.  And that's on under powered
> pine book pro. If such a system exist it'd probably be more performant.
> 
>> uclamp_min must not set a CPU overutilized because the CPU is not overutilized
>> in this case. It's only the task that is misfit. You mostly try to bias some
>> behavior to fit your use case.
> 
> Maybe we are talking about different things over here. As long as we agree it's
> a misfit task then we are aligned.

IMHO, utilization is about the running task and uclamp is maintained
taking the runnable tasks into consideration as well. Maybe that's the
source of the different views here?

> As far as I know misfit required overutilized to re-enable load balance. But
> maybe there's a detail that's creating this confusion.

I think that Vincent is suggesting to let MF load balance happening even
in !OverUtilized (OU). We gather the necessary load-balance statistics
already today in !OU so it is easily to do.

>>>> doesn means that the cpu is overutilized and uclamp_max should be used
>>>
>>> It is a misfit task; which requires overutilized to be set to re-enable load
>>> balance for uclamp_min to upgrate it. For uclamp max we should not set
>>> overutilized, agreed and that's what we should be doing.
>>
>> That's probably the root of your problem here. The load balance is still
>> periodically called even when EAS is enabled but the latter prevents task
>> migration unless overutilized in order to not break the energy aware task
> 
> Okay. For me this means load_balance is disabled since it's effectively doing
> nothing. So maybe it's a terminology problem of what I meant with load balance
> is disabled.
> 
>> placement. But if a task is misplaced and a cpu finds it can help,
>> we should let it pull the task without disabling EAS. This will not enable the
>> performance spread behavior and we can expect the other small tasks to be
>> packed by EAS on the best cpu at next wakeup.
>>
>> So intead of trying to detect a very specific capacity inversion use case
>> during this periodic this load balance, it's better to let a CPU that can
>> fix the misfit situation to pull the task.
> 
> I can't see the relation here.

To me it looks like that there are 2 stories here:

(A) Why do we need `CPU capacity inversion` (CapInv)? What do we gain
    by changing `cap = capacity_orig - arch_scale_thermal_pressure()`
    (i) to `cap = capacity_orig - thermal_load_avg()` (ii)? Do we want
    to avoid a CPU as a possible target in cases where pressure is
    already back to 0 and we still have a thermal PELT signal > 0
    because of it's historic information? But this is then only
    considering the decaying side of the thermal PELT signal.

    Vincent replaces (i) by (ii) so no need to switch to (ii) for
    CapInv.

(B) Don't use util_fits_cpu() in cpu_overutilized() and replace this by
    MF load balance in !OU.

[...]

>> I have reverted patches:
>> Revert "sched/fair: Detect capacity inversion"
>> Revert "sched/fair: Consider capacity inversion in util_fits_cpu()"
>> Revert "sched/uclamp: Make cpu_overutilized() use util_fits_cpu()"
>>
>> ---
>>  kernel/sched/fair.c | 32 +++++++++++++++++---------------
>>  1 file changed, 17 insertions(+), 15 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 4423681baf15..6e54afc0a6ec 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4539,7 +4539,7 @@ static inline int util_fits_cpu(unsigned long util,
>>  	 * the time.
>>  	 */
>>  	capacity_orig = capacity_orig_of(cpu);
>> -	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
>> +	capacity_orig_thermal = capacity_orig - thermal_load_avg(cpu_rq(cpu));
>>
>>  	/*
>>  	 * We want to force a task to fit a cpu as implied by uclamp_max.
>> @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
>>  	 *     2. The system is being saturated when we're operating near
>>  	 *        max capacity, it doesn't make sense to block overutilized.
>>  	 */
>> -	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
>> -	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
>> +	uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
>>  	fits = fits || uclamp_max_fits;
>>
>>  	/*
>> @@ -4614,7 +4613,7 @@ static inline int util_fits_cpu(unsigned long util,
>>  	 * handle the case uclamp_min > uclamp_max.
>>  	 */
>>  	uclamp_min = min(uclamp_min, uclamp_max);
>> -	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
>> +	if (util < uclamp_min)
>>  		fits = fits && (uclamp_min <= capacity_orig_thermal);
>>
>>  	return fits;
>> @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
>>  #ifdef CONFIG_SMP
>>  static inline bool cpu_overutilized(int cpu)
>>  {
>> -	return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
>> +	unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
>> +
>> +	return !(fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)) ||
>> +		 rq_util_max <= capacity_orig_of(cpu));
>>  }
>>
>>  static inline void update_overutilized_status(struct rq *rq)
>> @@ -10164,24 +10166,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>>  	 */
>>  	update_sd_lb_stats(env, &sds);
>>
>> -	if (sched_energy_enabled()) {
>> -		struct root_domain *rd = env->dst_rq->rd;
>> -
>> -		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
>> -			goto out_balanced;
>> -	}
>> -
>> -	local = &sds.local_stat;
>> -	busiest = &sds.busiest_stat;
>> -
>>  	/* There is no busy sibling group to pull tasks from */
>>  	if (!sds.busiest)
>>  		goto out_balanced;
>>
>> +	busiest = &sds.busiest_stat;
>> +
>>  	/* Misfit tasks should be dealt with regardless of the avg load */
>>  	if (busiest->group_type == group_misfit_task)
>>  		goto force_balance;
>>
>> +	if (sched_energy_enabled()) {
>> +		struct root_domain *rd = env->dst_rq->rd;
>> +
>> +		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
>> +			goto out_balanced;
>> +	}
>> +
>>  	/* ASYM feature bypasses nice load balance check */
>>  	if (busiest->group_type == group_asym_packing)
>>  		goto force_balance;
>> @@ -10194,6 +10195,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>>  	if (busiest->group_type == group_imbalanced)
>>  		goto force_balance;
>>
>> +	local = &sds.local_stat;
>>  	/*
>>  	 * If the local group is busier than the selected busiest group
>>  	 * don't try and pull any tasks.
>> --
>> 2.17.1

Specific questions regarding this patch proposal so I can fully
understand the intention:

I see 4 major changes here:

(1) Remove `CPU capacity inversion` (CapInv)

(2) Don't use util_fits_cpu() in cpu_overutilized()

(3) Use `capacity = capacity_orig - thermal_load_avg() (ii)` (and not `-
    arch_scale_thermal_pressure() (i)` in util_fits_cpu(), i.e. in
    sis(), feec(), MF handling (update + load balance (lb))

(4) Allow MF lb in !OverUtilized (OU)

(1) CapInv wouldn't be needed in case only (ii) is used (so we don't
have to switch from (i) to (ii) (because of 3)?

(2) is there since we don't have to raise OU anymore when `rq_util_min >
capacity_orig_thermal`. This is replaced by MF handling in !OU lb (4)?

[...]
Lukasz Luba Dec. 13, 2022, 5:42 p.m. UTC | #13
Hi Qais,

I thought I could help with this issue.

On 12/12/22 18:43, Qais Yousef wrote:
> On 12/09/22 17:47, Vincent Guittot wrote:
> 
> [...]
> 
>>>>>> This patch loops on all cpufreq policy in sched softirq, how can this
>>>>>> be sane ? and not only in eas mode but also in the default asymmetric
>>>>>
>>>>> Hmm I'm still puzzled. Why it's not sane to do it here but it's okay to do it
>>>>> in the wake up path in feec()?
>>>>
>>>> feec() should be considered as an exception not as the default rule.
>>>> Thing like above which loops for_each on external subsystem should be
>>>> prevented and the fact that feec loops all PDs doesn't means that we
>>>> can put that everywhere else
>>>
>>> Fair enough. But really understanding the root cause behind this limitation
>>> will be very helpful. I don't have the same appreciation of why this is
>>> a problem, and shedding more light will help me to think more about it in the
>>> future.
>>>
>>
>> Take the example of 1k cores with per cpu policy. Do you really think a
>> for_each_cpufreq_policy would be reasonable ?
> 
> Hmm I don't think such an HMP system makes sense to ever exist.
> 
> That system has to be a multi-socket system and I doubt inversion detection is
> something of value.
> 
> Point taken anyway. Let's find another way to do this.
> 

Another way might be to use the 'update' code path, which sets this
information source, for the thermal pressure. That code path isn't as
hot as this in the task scheduler. Furthermore, we would also
have time and handle properly CPU hotplug callbacks there.

So something like this, I have in mind:

------------------------------8<-----------------------------
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index e7d6e6657ffa..7f372a93e21b 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -16,6 +16,7 @@
  #include <linux/sched/topology.h>
  #include <linux/cpuset.h>
  #include <linux/cpumask.h>
+#include <linux/mutex.h>
  #include <linux/init.h>
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
@@ -153,6 +154,33 @@ void topology_set_freq_scale(const struct cpumask 
*cpus, unsigned long cur_freq,
  DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
  DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
  EXPORT_PER_CPU_SYMBOL_GPL(cpu_scale);

+static struct cpumask highest_capacity_mask;

+static struct cpumask highest_capacity_mask;
+static unsigned int max_possible_capacity;
+static DEFINE_MUTEX(max_capacity_lock);
+
+static void max_capacity_update(const struct cpumask *cpus,
+                               unsigned long capacity)
+{
+       mutex_lock(&max_capacity_lock);
+
+       if (max_possible_capacity < capacity) {
+               max_possible_capacity = capacity;
+
+               cpumask_clear(&highest_capacity_mask);
+
+               cpumask_or(&highest_capacity_mask,
+                          &highest_capacity_mask, cpus);
+       }
+
+       mutex_unlock(&max_capacity_lock);
+}
+
+bool topology_test_max_cpu_capacity(unsigned int cpu)
+{
+       return cpumask_test_cpu(cpu, &highest_capacity_mask);
+}
+EXPORT_SYMBOL_GPL(topology_test_max_cpu_capacity);
+
  void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
  {
         per_cpu(cpu_scale, cpu) = capacity;
@@ -203,6 +231,8 @@ void topology_update_thermal_pressure(const struct 
cpumask *cpus,

         for_each_cpu(cpu, cpus)
                 WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+
+       max_capacity_update(cpus, capacity);
  }
  EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);


--------------------------->8--------------------------------

We could use the RCU if there is a potential to read racy date
while the updater modifies the mask in the meantime. Mutex is to
serialize the thermal writers which might be kicked for two
policies at the same time.

If you like I can develop and test such code in the arch_topology.c

Regards,
Lukasz
Vincent Guittot Dec. 15, 2022, 5:39 p.m. UTC | #14
On Mon, 12 Dec 2022 at 19:43, Qais Yousef <qyousef@layalina.io> wrote:
>
> On 12/09/22 17:47, Vincent Guittot wrote:
>
> [...]
>
> > > > > > This patch loops on all cpufreq policy in sched softirq, how can this
> > > > > > be sane ? and not only in eas mode but also in the default asymmetric
> > > > >
> > > > > Hmm I'm still puzzled. Why it's not sane to do it here but it's okay to do it
> > > > > in the wake up path in feec()?
> > > >
> > > > feec() should be considered as an exception not as the default rule.
> > > > Thing like above which loops for_each on external subsystem should be
> > > > prevented and the fact that feec loops all PDs doesn't means that we
> > > > can put that everywhere else
> > >
> > > Fair enough. But really understanding the root cause behind this limitation
> > > will be very helpful. I don't have the same appreciation of why this is
> > > a problem, and shedding more light will help me to think more about it in the
> > > future.
> > >
> >
> > Take the example of 1k cores with per cpu policy. Do you really think a
> > for_each_cpufreq_policy would be reasonable ?
>
> Hmm I don't think such an HMP system makes sense to ever exist.
>
> That system has to be a multi-socket system and I doubt inversion detection is
> something of value.
>
> Point taken anyway. Let's find another way to do this.
>
> [...]
>
> > > This only makes sense to me if you refer to death by thousand cuts kind of
> > > problem. I can't see how this on its own being a problem.. I'll run some tests
> > > to convince myself.
> >
> > That's exactly the point, the scheduler tries hard to not add any system size
> > related limitation like: it's fine because it's only for 8 cores system.
> > I don't want to lock up the uclamp/util_fits_cpu in such limitations so you
> > can't use for_each_cpufreq or anything similar at runtime in sched softirq
>
> I'm happy to make things more generic so more users can benefit :-)
>
> >
> > >
> > > >
> > > > uclamp_min should not be used to set an over utilized cpu  because it
> > >
> > > uclamp_min must set overutilized. If a long running task has its uclamp_min
> > > changed, we want to upmigrate it if that's necessary to meet the new demand.
> >
> > I think that you are too much focused on your 8 cores android system.
>
> I am not; you're making an assumption here :-)
>
> HMP systems for 1k servers just don't make any sense. A desktop with 128 or
> even 256 HMP cores is a big stretch; and if that exist I don't think there's an
> overhead to worry about here; and I *did* consider this. I measured the impact
> if we have 128 and it was mere 1 or 2 us extra.  And that's on under powered
> pine book pro. If such a system exist it'd probably be more performant.
>
> > uclamp_min must not set a CPU overutilized because the CPU is not overutilized
> > in this case. It's only the task that is misfit. You mostly try to bias some
> > behavior to fit your use case.
>
> Maybe we are talking about different things over here. As long as we agree it's
> a misfit task then we are aligned.
>
> As far as I know misfit required overutilized to re-enable load balance. But
> maybe there's a detail that's creating this confusion.

The current implementation uses overutilized to fix misfit task
placement because it was assumed that misfit task also means that the
cpu is overutilized as only the utilization was used but It's not
mandatory as proposed below and there is no direct relation anymore
with uclamp_min

>
> >
> > >
> > > Keep in mind in android tasks could move between top-app, foreground and
> > > background groups.
> > >
> > > And keep in mind Android now implement a framework to dynamically change
> > > uclamp. It's available from Android 12 as part of Android Dynamic Performance
> > > Framework (ADPF). It's under CPU Hints.
> > >
> > >     https://developer.android.com/games/optimize/adpf
> > >
> >
> > I have never put any kind of limitation of the task or system behavior
>
> Nope you didn't. I was just trying to highlight some additional problems we're
> seeing that we'd need to consider :-)

I don't see any additional problem TBH.

>
> >
> > > > doesn means that the cpu is overutilized and uclamp_max should be used
> > >
> > > It is a misfit task; which requires overutilized to be set to re-enable load
> > > balance for uclamp_min to upgrate it. For uclamp max we should not set
> > > overutilized, agreed and that's what we should be doing.
> >
> > That's probably the root of your problem here. The load balance is still
> > periodically called even when EAS is enabled but the latter prevents task
> > migration unless overutilized in order to not break the energy aware task
>
> Okay. For me this means load_balance is disabled since it's effectively doing
> nothing. So maybe it's a terminology problem of what I meant with load balance
> is disabled.

This is not doing nothing and your patch is using load_balance to
detect inversion as an example

>
> > placement. But if a task is misplaced and a cpu finds it can help,
> > we should let it pull the task without disabling EAS. This will not enable the
> > performance spread behavior and we can expect the other small tasks to be
> > packed by EAS on the best cpu at next wakeup.
> >
> > So intead of trying to detect a very specific capacity inversion use case
> > during this periodic this load balance, it's better to let a CPU that can
> > fix the misfit situation to pull the task.
>
> I can't see the relation here.
>
> Capacity inversion is required NOT in load balance. In every place will look

You are using periodic load_balance to detect capacity inversion:
update_cpu_capacity() is called during periodic load_balance. Instead
of detecting capacity inversion, use load_balance to pull a misfit
task.
Either there is a cpu on which the task can fit better and it will
pull it or there is no better place and it will not

> for a fitting CPU; we need to ensure hints to place the task on big cpu works
> even under thermal pressure.
>
> If uclamp_min = 1024, but big core has 5% thermal pressure, it should still fit
> there. And feec() a long with all other callers should respect that too.

That's probably where I disagree, task should probably stay on this
cpu because there is no better option (ie cpu with higher compute
capacity) but the task doesn't fit on the cpu because its capacity is
below uclamp_min. You might think that I niptick there but that makes
a difference in the way to handle the situation.

>
> How better overutilized detection helps here? EAS needs to consider this as
> a candidate CPU and place it there at wake up.

If uclamp_min = 1024, but big core has 5% thermal pressure, big cpu
doesn't fit, otherwise uclamp_min is meaningless. But it can still
remain the best choice and feec should select it.

>
> >
> > See below
> >
> > >
> > > > for not setting overutilized a cpu with a clamped max value
> > > >
> > > > >
> > > > > I'm all ears if there's a simpler way to address the problem :-)
> > > >
> > > > let me try to prepare a something to show what I mean
> > >
> > > A PoC just to help me see what you mean would be great, thanks!
> > >
> >
> > I have reverted patches:
> > Revert "sched/fair: Detect capacity inversion"
> > Revert "sched/fair: Consider capacity inversion in util_fits_cpu()"
> > Revert "sched/uclamp: Make cpu_overutilized() use util_fits_cpu()"
> >
> > ---
> >  kernel/sched/fair.c | 32 +++++++++++++++++---------------
> >  1 file changed, 17 insertions(+), 15 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 4423681baf15..6e54afc0a6ec 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4539,7 +4539,7 @@ static inline int util_fits_cpu(unsigned long util,
> >        * the time.
> >        */
> >       capacity_orig = capacity_orig_of(cpu);
> > -     capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> > +     capacity_orig_thermal = capacity_orig - thermal_load_avg(cpu_rq(cpu));
> >
> >       /*
> >        * We want to force a task to fit a cpu as implied by uclamp_max.
> > @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
> >        *     2. The system is being saturated when we're operating near
> >        *        max capacity, it doesn't make sense to block overutilized.
> >        */
> > -     uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > -     uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> > +     uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
> >       fits = fits || uclamp_max_fits;
> >
> >       /*
> > @@ -4614,7 +4613,7 @@ static inline int util_fits_cpu(unsigned long util,
> >        * handle the case uclamp_min > uclamp_max.
> >        */
> >       uclamp_min = min(uclamp_min, uclamp_max);
> > -     if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > +     if (util < uclamp_min)
> >               fits = fits && (uclamp_min <= capacity_orig_thermal);
> >
> >       return fits;
> > @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
> >  #ifdef CONFIG_SMP
> >  static inline bool cpu_overutilized(int cpu)
> >  {
> > -     return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
> > +     unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> > +
> > +     return !(fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)) ||
> > +              rq_util_max <= capacity_orig_of(cpu));
> >  }
> >
> >  static inline void update_overutilized_status(struct rq *rq)
> > @@ -10164,24 +10166,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> >        */
> >       update_sd_lb_stats(env, &sds);
> >
> > -     if (sched_energy_enabled()) {
> > -             struct root_domain *rd = env->dst_rq->rd;
> > -
> > -             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > -                     goto out_balanced;
> > -     }
> > -
> > -     local = &sds.local_stat;
> > -     busiest = &sds.busiest_stat;
> > -
> >       /* There is no busy sibling group to pull tasks from */
> >       if (!sds.busiest)
> >               goto out_balanced;
> >
> > +     busiest = &sds.busiest_stat;
> > +
> >       /* Misfit tasks should be dealt with regardless of the avg load */
> >       if (busiest->group_type == group_misfit_task)
> >               goto force_balance;
> >
> > +     if (sched_energy_enabled()) {
> > +             struct root_domain *rd = env->dst_rq->rd;
> > +
> > +             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > +                     goto out_balanced;
> > +     }
> > +
> >       /* ASYM feature bypasses nice load balance check */
> >       if (busiest->group_type == group_asym_packing)
> >               goto force_balance;
> > @@ -10194,6 +10195,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> >       if (busiest->group_type == group_imbalanced)
> >               goto force_balance;
> >
> > +     local = &sds.local_stat;
> >       /*
> >        * If the local group is busier than the selected busiest group
> >        * don't try and pull any tasks.
> > --
> > 2.17.1
> >
> > There are probably some other changes to do here and here like check_misfit_status
> > but this enable another cpu with more capacity to pull a misfit task without
> > disabling EAS.
>
> I need time to digest this. But so far I see this is still irrelevant as the
> problem with the definition of what *fits* a CPU and what doesn't.
>
> On a system where:
>
>         LITTLE=160, MEDIUM=480, BIG=1024
>
> A task
>
>         p->util_avg = 200
>         p->util_min = 1024
>
> Should always be placed on a big core unless the big core is capacity inverted.
> That means most importantly in feec().
>
> How does the above fix that?

Not differently than if BIG is under thermal pressure and its capacity
goes below 480. In this case, MEDIUM is the best CPU and should be
selected. Note that the task still doesn't fit on any cpu but neither
BIG nor MEDIUM are over utilized and there is no need to disable EAS
and spread the tasks everywhere. feec() and select_idle_capacity()
should take care of this case and as a result of yours. I don't see
any difference in the way both cases should be handled whereas
capacity inversion makes a difference.

>
> This decision should be taken coherently in feec(), select_idle_capacity() and
> any place that wants to check if the task fits the cpu or not.
>
> I'm not sure why you're isolating this to the overutilized. It is just one of
> the call sites that needs to check for fitness when uclamp is being used.
>
> >
> > > >
> > > > >
> > > > > The problem is that thermal pressure on big cpu is not important from
> > > > > uclamp perspective until it is in inversion state. It is quite common to have
> > > > > a system where the medium capacity is in 500 range. If the big is under thermal
> > > > > pressure that it drops to 800, then it is still a fitting CPU from uclamp
> > > > > perspective. Keep in mind uclamp_min is useful for tasks whose utilization is
> > > > > small So we need to be selective when thermal pressure is actually helping out
> > > > > or just creating unnecessary problems.
> > > >
> > > > What about medium cores ?  There are often 3 capacity levels now so
> > > > the same can happen between medium and little.
> > >
> > > We do subtract thermal pressure from capacity for the mediums and littles all
> > > the time. So that case is handled, no?
> >
> > you don't need iversion detection for medium vs little ? Why it's acceptable here
> > and not between bg vs medium
>
> Because it's the highest level. You have no option to do better. uclamp_min is
> a hint; if the util_avg is small (ie: it still fits_capacity()); best effort is
> to keep giving the task the performance it needs. While on a big core; you
> can't upmigrate to any other level; so thermal pressure on big core is
> meaningless until it is in capacity inversion.

>
> If the user asks for something and finds that their task is placed randomly
> that's not good, no?

nobody says to place the task randomly.

>
> What you say makes sense against util_avg; the rule of which is still the same
> as before.
>
> >
> > >>
> > >
> > > >
> > > > Also we have more and more thermal capping happening ahead of
> > > > overheating like the power cap interface where we want to cap some
> > > > CPUs and others even before they reach their thermal limit in order to
> > > > get more power/thermal room for others. This implies that some CPUs
> > > > could be under thermal pressure whereas others not. This implies also
> > > > that a UC with big core capacity being below little and medium cores
> > > > is not impossible or medium being under little but big staying at high
> > > > capacity.
> > >
> > > I *think* I am catering for this already. The big is special because it's the
> > > highest level, so thermal pressure; from uclamp perspective, is not meaningful
> > > as it's the best you can ever get on this system; untill it becomes capacity
> > > inverted that is.
> > >
> >
> > That's another point that concerns me, you put some special condition on big
> > cores to be able to detect an possible inversion.
> > If a task deosn't fit a cpu because of thermal pressure, it should be tagged misfit
> > and let other cpu looks if they can do better. That's doesnt mean that we
> > will find one.
>
> This breaks the usefulness of the hint. What you say means a small task that
> wants to run as fast as possible will find itself placed randomly instead

Why randomly ? feec() and others should continue to select the best
cpu even if it may not fit util_min constraint, whatever the cpu BIG,
MEDIUM or LITTLE

> whenever there's thermal pressure. And userspace should just figure this out
> on their own somehow that thermal pressure affects them and they start to find
> a way to read the thermal pressure and set the right uclamp_min value??
>
> As I tried to highlight; this will be set dynamically based on perceived
> performance. Even for a static configuration; I don't think a hint of
> uclamp_min = 1024 should fail that easily because some cores can be very big
> and always have a residual 1 or 2 % thermal pressure.
>
> [...]
>
> > >
> > > I see.
> > >
> > > Just to make sure, this is only used in LB only, right?
> >
> > Yes, although I haven't check the IPC patchset
> >
> > >
> > > I do have a problem that might make this more complicated.
> > >
> > > I'm considering if we can enable packing in load_balance by using feec().
> > > Limiting the search space to only idle CPUs causes problems. And these problems
> > > relate to both performance and power.
> > >
> > > For example, If a small task is running on a big CPU, it will prevent big-ish
> > > tasks from migrating there on load balance if the only idle cpu is a little
> > > one.
> > >
> > > And if a small task moves to a big cpu on load balance because it's the only
> > > idle one, it could take the cluster out of deep sleep while it could have
> >
> > Maybe your root problem is that the small task migrates on a big cpu only
> > because it's the only idle one and not because of utilization (uclamp_min/max
> > included)
>
> Yes I think that's the reason too and that's why I'm suggesting maybe we need
> to consider packing for systems that care about power.
>
> >
> > > easily crammed on a little core.
> > >
> > > I can't say I've debugged these problems. But we do get lots of issues related
> > > to big cpus waking up unnecessarily (it could be per-cpu tasks too); and task
> > > not migrating fast enough.
> > >
> > > I think the load balance behavior of going with idle cpus first is
> > > contributing. I hope to dig more into this problem in the near future.
> >
> > the load balance is not going with idle cpu 1st, the load balance happens on
> > all cpus but the period of busy cpus is higher to not impact much the running
> > load whereas idle cpus don't have anything else to do except lokking for work
> > to pull and saving power
>
> Okay the impact is the same. We pick idle cpus first; and
> select_idle_capacity() which is what we should go through for HMP only search
> for CPUs that are idle.
>
> >
> > >
> > > Can we consider these problems along the way too?
> > >
> > > > They both try to find the cpu with highest capacity that will fit the
> > > > task requirement
> > >
> > > find the *idl* cpu with highest capacity, right? IIUC, we try to find idle cpu
> > > then use load if all is busy? Sorry I don't have enough of the load balance
> > > code in my head yet..
> > >
> > > >
> > > > >
> > > > > To be honest, I personally think EAS can be useful on SMP systems and it would
> > > > > be nice to enable it outside of sched_asym_cpucapacity.
> > > >
> > > > or even before that task_fits_cpu should probably be useful for some smp case
> > >
> > > I think that should we doable even with the current approach if inversion
> > > detection is not actually the problem and just the way we do it.
> > >
> > > I chose the current approach because it seemed the simpler to me. I can do
> > > something in the topology code to help make the inversion detection much easier
> > > and more generic. But I'll hold on a bit in case you have a thought on another
> > > way to tackle the problem.
> > >
> > > We do have actually a similar problem for RT that I need to tackle too. So
> > > maybe moving it to topology will help both CFS and RT.
> > >
> > > >
> > > > >
> > > > > I'm interested to hear more about this unification idea actually. If you feel
> > > > > a bit chatty to describe in more detail how do you see this being unified, that
> > > > > could be enlightening for some of us who work in this area :-)
> > > >
> > > > First, keep everything scalable and don't loop on all cpufreq policy
> > > > or anything else that can't scale so we don't have to take care of the
> > > > number of cpu of cpufreq policy in the system
> > > > Then, I think that a good starting point would be to merge the
> > >
> > > If your issue just with that, and not inversion detection in particular,
> >
> > Looping on all cpufreq policies is the main problem but i also think that capacity
> > inversion is specific to your system and can be replaced by a generic solution
> > that will not make any assumption on any CPUs big vs little or
> > shared vs per cpu dvfs
>
> It is not specific to 'my' system. It is a generic problem to any HMP based
> system. I can understand issues with the implementation. But I can't
> understand how you think taking thermal pressure unconditionally isn't
> a problem. This unconditional consideration of thermal pressure is my problem.
>
>
> Cheers
>
> --
> Qais Yousef
>
> >
> >
> > > I think I can find an alternative approach. Though for education purposes I'd
> > > like to make sure to understand how/when this scalability problem manifests.
> > >
> > > > behavior of SD_ASYM_CPUCAPACITY and SD_ASYM_PACKING instead of 2
> > > > parallel and similar behaviors
> > >
> > > To be clear, you mean load balance path right? If yes, then I'd like us to
> > > consider how we can cater for some of the potential issues I mentioned above.
> > >
> > > > The misfit task could also be unified to integrate other things than
> > > > EAS like the IPC that ricardo is working one. I haven't look in
> > > > details how this could be doable
> > >
> > > To make things more complicated, to increase the effectiveness of uclamp_max,
> > > we need a 'reverse misfit' behavior (terminology credit to Morten).
> > >
> > > If a busy long running task has its uclamp_max changed such that we can put it
> > > on a more energy efficient core, we want to extend misfit to handle down
> > > migrations too. We relied on tasks going to sleep and feec() automatically
> > > doing the right thing next wake up. With uclamp_max the next wake up might not
> > > happen for a while.
> > >
> > >
> > > Thanks!!
> > >
> > > --
> > > Qais Yousef
Vincent Guittot Dec. 15, 2022, 5:46 p.m. UTC | #15
On Tue, 13 Dec 2022 at 18:38, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> On 12/12/2022 19:43, Qais Yousef wrote:
> > On 12/09/22 17:47, Vincent Guittot wrote:
>
> [...]
>
> > HMP systems for 1k servers just don't make any sense. A desktop with 128 or
> > even 256 HMP cores is a big stretch; and if that exist I don't think there's an
> > overhead to worry about here; and I *did* consider this. I measured the impact
> > if we have 128 and it was mere 1 or 2 us extra.  And that's on under powered
> > pine book pro. If such a system exist it'd probably be more performant.
> >
> >> uclamp_min must not set a CPU overutilized because the CPU is not overutilized
> >> in this case. It's only the task that is misfit. You mostly try to bias some
> >> behavior to fit your use case.
> >
> > Maybe we are talking about different things over here. As long as we agree it's
> > a misfit task then we are aligned.
>
> IMHO, utilization is about the running task and uclamp is maintained
> taking the runnable tasks into consideration as well. Maybe that's the
> source of the different views here?
>
> > As far as I know misfit required overutilized to re-enable load balance. But
> > maybe there's a detail that's creating this confusion.
>
> I think that Vincent is suggesting to let MF load balance happening even
> in !OverUtilized (OU). We gather the necessary load-balance statistics
> already today in !OU so it is easily to do.
>
> >>>> doesn means that the cpu is overutilized and uclamp_max should be used
> >>>
> >>> It is a misfit task; which requires overutilized to be set to re-enable load
> >>> balance for uclamp_min to upgrate it. For uclamp max we should not set
> >>> overutilized, agreed and that's what we should be doing.
> >>
> >> That's probably the root of your problem here. The load balance is still
> >> periodically called even when EAS is enabled but the latter prevents task
> >> migration unless overutilized in order to not break the energy aware task
> >
> > Okay. For me this means load_balance is disabled since it's effectively doing
> > nothing. So maybe it's a terminology problem of what I meant with load balance
> > is disabled.
> >
> >> placement. But if a task is misplaced and a cpu finds it can help,
> >> we should let it pull the task without disabling EAS. This will not enable the
> >> performance spread behavior and we can expect the other small tasks to be
> >> packed by EAS on the best cpu at next wakeup.
> >>
> >> So intead of trying to detect a very specific capacity inversion use case
> >> during this periodic this load balance, it's better to let a CPU that can
> >> fix the misfit situation to pull the task.
> >
> > I can't see the relation here.
>
> To me it looks like that there are 2 stories here:
>
> (A) Why do we need `CPU capacity inversion` (CapInv)? What do we gain
>     by changing `cap = capacity_orig - arch_scale_thermal_pressure()`
>     (i) to `cap = capacity_orig - thermal_load_avg()` (ii)? Do we want
>     to avoid a CPU as a possible target in cases where pressure is
>     already back to 0 and we still have a thermal PELT signal > 0
>     because of it's historic information? But this is then only
>     considering the decaying side of the thermal PELT signal.
>
>     Vincent replaces (i) by (ii) so no need to switch to (ii) for
>     CapInv.
>
> (B) Don't use util_fits_cpu() in cpu_overutilized() and replace this by
>     MF load balance in !OU.
>
> [...]
>
> >> I have reverted patches:
> >> Revert "sched/fair: Detect capacity inversion"
> >> Revert "sched/fair: Consider capacity inversion in util_fits_cpu()"
> >> Revert "sched/uclamp: Make cpu_overutilized() use util_fits_cpu()"
> >>
> >> ---
> >>  kernel/sched/fair.c | 32 +++++++++++++++++---------------
> >>  1 file changed, 17 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index 4423681baf15..6e54afc0a6ec 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -4539,7 +4539,7 @@ static inline int util_fits_cpu(unsigned long util,
> >>       * the time.
> >>       */
> >>      capacity_orig = capacity_orig_of(cpu);
> >> -    capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> >> +    capacity_orig_thermal = capacity_orig - thermal_load_avg(cpu_rq(cpu));
> >>
> >>      /*
> >>       * We want to force a task to fit a cpu as implied by uclamp_max.
> >> @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
> >>       *     2. The system is being saturated when we're operating near
> >>       *        max capacity, it doesn't make sense to block overutilized.
> >>       */
> >> -    uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> >> -    uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> >> +    uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
> >>      fits = fits || uclamp_max_fits;
> >>
> >>      /*
> >> @@ -4614,7 +4613,7 @@ static inline int util_fits_cpu(unsigned long util,
> >>       * handle the case uclamp_min > uclamp_max.
> >>       */
> >>      uclamp_min = min(uclamp_min, uclamp_max);
> >> -    if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> >> +    if (util < uclamp_min)
> >>              fits = fits && (uclamp_min <= capacity_orig_thermal);
> >>
> >>      return fits;
> >> @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
> >>  #ifdef CONFIG_SMP
> >>  static inline bool cpu_overutilized(int cpu)
> >>  {
> >> -    return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
> >> +    unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> >> +
> >> +    return !(fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)) ||
> >> +             rq_util_max <= capacity_orig_of(cpu));
> >>  }
> >>
> >>  static inline void update_overutilized_status(struct rq *rq)
> >> @@ -10164,24 +10166,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> >>       */
> >>      update_sd_lb_stats(env, &sds);
> >>
> >> -    if (sched_energy_enabled()) {
> >> -            struct root_domain *rd = env->dst_rq->rd;
> >> -
> >> -            if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> >> -                    goto out_balanced;
> >> -    }
> >> -
> >> -    local = &sds.local_stat;
> >> -    busiest = &sds.busiest_stat;
> >> -
> >>      /* There is no busy sibling group to pull tasks from */
> >>      if (!sds.busiest)
> >>              goto out_balanced;
> >>
> >> +    busiest = &sds.busiest_stat;
> >> +
> >>      /* Misfit tasks should be dealt with regardless of the avg load */
> >>      if (busiest->group_type == group_misfit_task)
> >>              goto force_balance;
> >>
> >> +    if (sched_energy_enabled()) {
> >> +            struct root_domain *rd = env->dst_rq->rd;
> >> +
> >> +            if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> >> +                    goto out_balanced;
> >> +    }
> >> +
> >>      /* ASYM feature bypasses nice load balance check */
> >>      if (busiest->group_type == group_asym_packing)
> >>              goto force_balance;
> >> @@ -10194,6 +10195,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> >>      if (busiest->group_type == group_imbalanced)
> >>              goto force_balance;
> >>
> >> +    local = &sds.local_stat;
> >>      /*
> >>       * If the local group is busier than the selected busiest group
> >>       * don't try and pull any tasks.
> >> --
> >> 2.17.1
>
> Specific questions regarding this patch proposal so I can fully
> understand the intention:
>
> I see 4 major changes here:
>
> (1) Remove `CPU capacity inversion` (CapInv)
>
> (2) Don't use util_fits_cpu() in cpu_overutilized()
>
> (3) Use `capacity = capacity_orig - thermal_load_avg() (ii)` (and not `-
>     arch_scale_thermal_pressure() (i)` in util_fits_cpu(), i.e. in
>     sis(), feec(), MF handling (update + load balance (lb))
>
> (4) Allow MF lb in !OverUtilized (OU)
>
> (1) CapInv wouldn't be needed in case only (ii) is used (so we don't
> have to switch from (i) to (ii) (because of 3)?
>
> (2) is there since we don't have to raise OU anymore when `rq_util_min >
> capacity_orig_thermal`. This is replaced by MF handling in !OU lb (4)?

yes and also because a task can misfit on a cpu because of uclamp_min
but the cpu is not overloaded

>
> [...]
Qais Yousef Dec. 20, 2022, 11:51 a.m. UTC | #16
On 12/13/22 17:42, Lukasz Luba wrote:
> Hi Qais,
> 
> I thought I could help with this issue.

Thanks Lukasz!

> 
> On 12/12/22 18:43, Qais Yousef wrote:
> > On 12/09/22 17:47, Vincent Guittot wrote:
> > 
> > [...]
> > 
> > > > > > > This patch loops on all cpufreq policy in sched softirq, how can this
> > > > > > > be sane ? and not only in eas mode but also in the default asymmetric
> > > > > > 
> > > > > > Hmm I'm still puzzled. Why it's not sane to do it here but it's okay to do it
> > > > > > in the wake up path in feec()?
> > > > > 
> > > > > feec() should be considered as an exception not as the default rule.
> > > > > Thing like above which loops for_each on external subsystem should be
> > > > > prevented and the fact that feec loops all PDs doesn't means that we
> > > > > can put that everywhere else
> > > > 
> > > > Fair enough. But really understanding the root cause behind this limitation
> > > > will be very helpful. I don't have the same appreciation of why this is
> > > > a problem, and shedding more light will help me to think more about it in the
> > > > future.
> > > > 
> > > 
> > > Take the example of 1k cores with per cpu policy. Do you really think a
> > > for_each_cpufreq_policy would be reasonable ?
> > 
> > Hmm I don't think such an HMP system makes sense to ever exist.
> > 
> > That system has to be a multi-socket system and I doubt inversion detection is
> > something of value.
> > 
> > Point taken anyway. Let's find another way to do this.
> > 
> 
> Another way might be to use the 'update' code path, which sets this
> information source, for the thermal pressure. That code path isn't as
> hot as this in the task scheduler. Furthermore, we would also
> have time and handle properly CPU hotplug callbacks there.
> 
> So something like this, I have in mind:
> 
> ------------------------------8<-----------------------------
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index e7d6e6657ffa..7f372a93e21b 100644
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -16,6 +16,7 @@
>  #include <linux/sched/topology.h>
>  #include <linux/cpuset.h>
>  #include <linux/cpumask.h>
> +#include <linux/mutex.h>
>  #include <linux/init.h>
>  #include <linux/rcupdate.h>
>  #include <linux/sched.h>
> @@ -153,6 +154,33 @@ void topology_set_freq_scale(const struct cpumask
> *cpus, unsigned long cur_freq,
>  DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
>  DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
>  EXPORT_PER_CPU_SYMBOL_GPL(cpu_scale);
> 
> +static struct cpumask highest_capacity_mask;
> 
> +static struct cpumask highest_capacity_mask;
> +static unsigned int max_possible_capacity;
> +static DEFINE_MUTEX(max_capacity_lock);
> +
> +static void max_capacity_update(const struct cpumask *cpus,
> +                               unsigned long capacity)
> +{
> +       mutex_lock(&max_capacity_lock);
> +
> +       if (max_possible_capacity < capacity) {
> +               max_possible_capacity = capacity;
> +
> +               cpumask_clear(&highest_capacity_mask);
> +
> +               cpumask_or(&highest_capacity_mask,
> +                          &highest_capacity_mask, cpus);
> +       }
> +
> +       mutex_unlock(&max_capacity_lock);
> +}
> +
> +bool topology_test_max_cpu_capacity(unsigned int cpu)
> +{
> +       return cpumask_test_cpu(cpu, &highest_capacity_mask);
> +}
> +EXPORT_SYMBOL_GPL(topology_test_max_cpu_capacity);
> +
>  void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
>  {
>         per_cpu(cpu_scale, cpu) = capacity;
> @@ -203,6 +231,8 @@ void topology_update_thermal_pressure(const struct
> cpumask *cpus,
> 
>         for_each_cpu(cpu, cpus)
>                 WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
> +
> +       max_capacity_update(cpus, capacity);
>  }
>  EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
> 
> 
> --------------------------->8--------------------------------
> 
> We could use the RCU if there is a potential to read racy date
> while the updater modifies the mask in the meantime. Mutex is to
> serialize the thermal writers which might be kicked for two
> policies at the same time.
> 
> If you like I can develop and test such code in the arch_topology.c

As we discussed offline, Vincent is keen on decoupling the util_fits_cpu()
logic from HMP - which means I need to reword this differently.

Let's keep this in the back burner in case we need to revisit it again.

Appreciate the proposal!!


Many thanks

--
Qais Yousef
Lukasz Luba Dec. 20, 2022, 12:52 p.m. UTC | #17
On 12/20/22 11:51, Qais Yousef wrote:
> On 12/13/22 17:42, Lukasz Luba wrote:
>> Hi Qais,
>>
>> I thought I could help with this issue.
> 
> Thanks Lukasz!
> 
>>
>> On 12/12/22 18:43, Qais Yousef wrote:
>>> On 12/09/22 17:47, Vincent Guittot wrote:
>>>
>>> [...]
>>>

[snip]

>>
>> If you like I can develop and test such code in the arch_topology.c
> 
> As we discussed offline, Vincent is keen on decoupling the util_fits_cpu()
> logic from HMP - which means I need to reword this differently.

Fingers crossed!

> 
> Let's keep this in the back burner in case we need to revisit it again.
> 
> Appreciate the proposal!!

No worries!

Cheers,
Lukasz

> 
> 
> Many thanks
> 
> --
> Qais Yousef
Qais Yousef Dec. 23, 2022, 11:58 a.m. UTC | #18
On 12/20/22 14:50, Vincent Guittot wrote:

Thanks for the patch!

> Hereafter is what I came with in order to decouple misfit task with cpu
> overutilized. We keep using util_fits_cpu but with 3 values so we can keep
> using it with cpu_overutilized but exclude the case of misfit task
> because uclmap_min. Also select_idle_capacity() and feec() keep selecting the
> big cpu even if it doesn't fit only because of uclamp_min
> 
> 
> Subject: [PATCH] sched/fair: unlink misfit task from cpu overutilized
> 
> By taking into account uclamp_min, the 1:1 relation between task misfit and
> cpu overutilized is no more true as a task with a util_avg of 20as an
> example may not fit a 1024 capacity cpu because of a uclamp_min constraint.
> 
> Add a new state in util_fits_cpu() to reflect the case that task would fit
> a CPU except for the uclamp_min hint which is a bandwidth requriement.

nit: mixing uclamp with bandwidth has been a source of a lot of confusion when
discussing uclamp. Can we use performance requirement instead please?

> 
> Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> can use this new value to take additional action to select the best cpu
> that doesn't match uclamp_min.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++---------------
>  1 file changed, 49 insertions(+), 24 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4423681baf15..705335d6af65 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
>  	 *     2. The system is being saturated when we're operating near
>  	 *        max capacity, it doesn't make sense to block overutilized.
>  	 */
> -	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> -	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> +	uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
>  	fits = fits || uclamp_max_fits;
>  
>  	/*
> @@ -4614,8 +4613,8 @@ static inline int util_fits_cpu(unsigned long util,
>  	 * handle the case uclamp_min > uclamp_max.
>  	 */
>  	uclamp_min = min(uclamp_min, uclamp_max);
> -	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> -		fits = fits && (uclamp_min <= capacity_orig_thermal);
> +	if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> +		return -1;
>  
>  	return fits;

nit: return !!fits?

We check explicitly == 1 below and I'm not sure all the boolean check above
will guarantee we will end up return 1 for true on all combination of
compilerls/archs.

>  }
> @@ -4625,7 +4624,7 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
>  	unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
>  	unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
>  	unsigned long util = task_util_est(p);
> -	return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> +	return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) == 1);

Or make this >  0?

>  }
>  
>  static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
>  #ifdef CONFIG_SMP
>  static inline bool cpu_overutilized(int cpu)
>  {
> -	return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
> +	unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> +	unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> +
> +	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
>  }
>  
>  static inline void update_overutilized_status(struct rq *rq)
> @@ -6857,6 +6859,7 @@ static int
>  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  {
>  	unsigned long task_util, util_min, util_max, best_cap = 0;
> +	int fits, best_fits = -1;
>  	int cpu, best_cpu = -1;
>  	struct cpumask *cpus;
>  
> @@ -6872,12 +6875,24 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>  
>  		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
>  			continue;
> -		if (util_fits_cpu(task_util, util_min, util_max, cpu))
> +
> +		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> +
> +		/* This cpu fits with all capacity requirements */

nit: s#capacity#capacity & performance#?

> +		if (fits > 0)
>  			return cpu;
> +		/*
> +		 * Only the min bandwidth (i.e. uclamp_min) doesn't fit. Look
> +		 * for the cpu with highest bandwidth capacity.
> +		 */

s/bandwidth/performance/?

> +		else if (fits < 0)
> +			cpu_cap = capacity_of(cpu) - thermal_load_avg(cpu_rq(cpu));

Hmm. Isn't capacity_of() already takes into account thermal_load_avg()?

Did you mean capacity_orig_of()?

>  
> -		if (cpu_cap > best_cap) {
> +		if ((fits > best_fits) ||
> +		    ((fits == best_fits) && (cpu_cap > best_cap))) {
>  			best_cap = cpu_cap;
>  			best_cpu = cpu;
> +			best_fits = fits;

I'm not sure if this logic is correct. It's a bit of a mind  bender.

	@iter#0

		fits <= 0
		best_fits <= -1

		if (fits > best_fits) // 0 > -1 => True
			...	// update best_cap if larger
			best_fits <= 0

	@iter#1

		fits <= -1
		best_fits <= 0

		if (fits > best_fits) // -1 > 0 => False

		if (fits == best_fits) // -1 == 0 => False

		// We will never update best_cap for all fits = -1 after
		// encountering the first fits = 0

I think we should reverse the initial values and split the conditions

	int fits, best_fits = 0;

		if ((fits < best_fits)) {
			/* Reset best_cap for first "fits_but" */
			best_cap = cpu_cap;
			best_cpu = cpu;
			best_fits = fits;
		} else if ((fits == best_fits) && (cpu_cap > best_cap))) {
			best_cap = cpu_cap;
			best_cpu = cpu;
		}

Which give us

	@iter#0

		fits <= 0
		best_fits <= 0

		if (fits < best_fits) // 0 < 0 => False

		if (fits == best_fits) // 0 == 0 => True
			...	// update best_cap if larger

	@iter#1

		fits <= -1
		best_fits <= 0

		if (fits < best_fits) // -1 < 0 => True
			...	// reset best_cap to first "fits_but" hit
			best_fits <= -1

	@iter#2

		fits <= 0
		best_fits <= -1

		if (fits < best_fits) // 0 < -1 => False

		if (fits == best_fits) // 0 == -1 => False

		// We should never update best_cap for all fits == 0 now

	@iter#3

		fits <= -1
		best_fits <= -1

		if (fits < best_fits) // -1 < -1 => False

		if (fits == best_fits) // -1 == -1 => True
			...	// update best_cap if larger

		// Only fits = -1 will update best_cap if larger now

Of course any hit with fits = 1 will return the cpu immediately.


>  		}
>  	}
>  
> @@ -6890,7 +6905,7 @@ static inline bool asym_fits_cpu(unsigned long util,
>  				 int cpu)
>  {
>  	if (sched_asym_cpucap_active())
> -		return util_fits_cpu(util, util_min, util_max, cpu);
> +		return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
>  
>  	return true;
>  }
> @@ -7257,6 +7272,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  	unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
>  	struct root_domain *rd = this_rq()->rd;
>  	int cpu, best_energy_cpu, target = -1;
> +	int prev_fits = -1, best_fits = -1;
>  	struct sched_domain *sd;
>  	struct perf_domain *pd;
>  	struct energy_env eenv;
> @@ -7288,10 +7304,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  		unsigned long cpu_cap, cpu_thermal_cap, util;
>  		unsigned long cur_delta, max_spare_cap = 0;
>  		unsigned long rq_util_min, rq_util_max;
> -		unsigned long util_min, util_max;
> +		unsigned long util_min = 0, util_max = 1024;

Why this change? Are you hitting the same warning reported by Dan?

>  		unsigned long prev_spare_cap = 0;
>  		int max_spare_cap_cpu = -1;
>  		unsigned long base_energy;
> +		int fits, max_fits = -1;
>  
>  		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
>  
> @@ -7344,7 +7361,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  					util_max = max(rq_util_max, p_util_max);
>  				}
>  			}
> -			if (!util_fits_cpu(util, util_min, util_max, cpu))
> +
> +			fits = util_fits_cpu(util, util_min, util_max, cpu);
> +			if (!fits)
>  				continue;
>  
>  			lsub_positive(&cpu_cap, util);
> @@ -7352,7 +7371,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  			if (cpu == prev_cpu) {
>  				/* Always use prev_cpu as a candidate. */
>  				prev_spare_cap = cpu_cap;
> -			} else if (cpu_cap > max_spare_cap) {
> +				prev_fits = fits;
> +			} else if ((fits > max_fits) ||
> +				   ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
>  				/*
>  				 * Find the CPU with the maximum spare capacity
>  				 * among the remaining CPUs in the performance
> @@ -7360,6 +7381,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  				 */
>  				max_spare_cap = cpu_cap;
>  				max_spare_cap_cpu = cpu;
> +				max_fits = fits;

Should we reset best_delta here?

Because we update max_fits here..

>  			}
>  		}
>  
> @@ -7389,15 +7411,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>  			if (cur_delta < base_energy)
>  				goto unlock;
>  			cur_delta -= base_energy;
> -			if (cur_delta < best_delta) {
> +			if ((fits > max_fits) ||
> +			    ((fits == max_fits) && (cur_delta < best_delta))) {

.. on first first transitions from -1 to 1; this condition will be
skipped if cur_delta is lower than best delta. best_delta here could be the
previous -1 fitting cpu.

We should reset best_delta on first transition then look if we encounter
something with a better delta?


Thanks!

--
Qais Yousef

>  				best_delta = cur_delta;
>  				best_energy_cpu = max_spare_cap_cpu;
> +				best_fits = max_fits;
>  			}
>  		}
>  	}
>  	rcu_read_unlock();
>  
> -	if (best_delta < prev_delta)
> +	if ((best_fits > prev_fits) ||
> +	    ((best_fits == prev_fits) && (best_delta < prev_delta)))
>  		target = best_energy_cpu;
>  
>  	return target;
> @@ -10164,24 +10189,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>  	 */
>  	update_sd_lb_stats(env, &sds);
>  
> -	if (sched_energy_enabled()) {
> -		struct root_domain *rd = env->dst_rq->rd;
> -
> -		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> -			goto out_balanced;
> -	}
> -
> -	local = &sds.local_stat;
> -	busiest = &sds.busiest_stat;
> -
>  	/* There is no busy sibling group to pull tasks from */
>  	if (!sds.busiest)
>  		goto out_balanced;
>  
> +	busiest = &sds.busiest_stat;
> +
>  	/* Misfit tasks should be dealt with regardless of the avg load */
>  	if (busiest->group_type == group_misfit_task)
>  		goto force_balance;
>  
> +	if (sched_energy_enabled()) {
> +		struct root_domain *rd = env->dst_rq->rd;
> +
> +		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> +			goto out_balanced;
> +	}
> +
>  	/* ASYM feature bypasses nice load balance check */
>  	if (busiest->group_type == group_asym_packing)
>  		goto force_balance;
> @@ -10194,6 +10218,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
>  	if (busiest->group_type == group_imbalanced)
>  		goto force_balance;
>  
> +	local = &sds.local_stat;
>  	/*
>  	 * If the local group is busier than the selected busiest group
>  	 * don't try and pull any tasks.
> -- 
> 2.17.1
> 
> 
> 
> > 
> > 
> > Thanks!!
> > 
> > --
> > Qais Yousef
Vincent Guittot Dec. 27, 2022, 1:33 p.m. UTC | #19
On Fri, 23 Dec 2022 at 12:58, Qais Yousef <qyousef@layalina.io> wrote:
>
> On 12/20/22 14:50, Vincent Guittot wrote:
>
> Thanks for the patch!
>
> > Hereafter is what I came with in order to decouple misfit task with cpu
> > overutilized. We keep using util_fits_cpu but with 3 values so we can keep
> > using it with cpu_overutilized but exclude the case of misfit task
> > because uclmap_min. Also select_idle_capacity() and feec() keep selecting the
> > big cpu even if it doesn't fit only because of uclamp_min
> >
> >
> > Subject: [PATCH] sched/fair: unlink misfit task from cpu overutilized
> >
> > By taking into account uclamp_min, the 1:1 relation between task misfit and
> > cpu overutilized is no more true as a task with a util_avg of 20as an
> > example may not fit a 1024 capacity cpu because of a uclamp_min constraint.
> >
> > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > a CPU except for the uclamp_min hint which is a bandwidth requriement.
>
> nit: mixing uclamp with bandwidth has been a source of a lot of confusion when
> discussing uclamp. Can we use performance requirement instead please?

ok

>
> >
> > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > can use this new value to take additional action to select the best cpu
> > that doesn't match uclamp_min.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> >  kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++---------------
> >  1 file changed, 49 insertions(+), 24 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 4423681baf15..705335d6af65 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
> >        *     2. The system is being saturated when we're operating near
> >        *        max capacity, it doesn't make sense to block overutilized.
> >        */
> > -     uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > -     uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> > +     uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
> >       fits = fits || uclamp_max_fits;
> >
> >       /*
> > @@ -4614,8 +4613,8 @@ static inline int util_fits_cpu(unsigned long util,
> >        * handle the case uclamp_min > uclamp_max.
> >        */
> >       uclamp_min = min(uclamp_min, uclamp_max);
> > -     if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > -             fits = fits && (uclamp_min <= capacity_orig_thermal);
> > +     if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > +             return -1;
> >
> >       return fits;
>
> nit: return !!fits?
>
> We check explicitly == 1 below and I'm not sure all the boolean check above
> will guarantee we will end up return 1 for true on all combination of
> compilerls/archs.
>
> >  }
> > @@ -4625,7 +4624,7 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> >       unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> >       unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> >       unsigned long util = task_util_est(p);
> > -     return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > +     return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) == 1);
>
> Or make this >  0?

yes, will use > 0

>
> >  }
> >
> >  static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
> >  #ifdef CONFIG_SMP
> >  static inline bool cpu_overutilized(int cpu)
> >  {
> > -     return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
> > +     unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > +     unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> > +
> > +     return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> >  }
> >
> >  static inline void update_overutilized_status(struct rq *rq)
> > @@ -6857,6 +6859,7 @@ static int
> >  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> >  {
> >       unsigned long task_util, util_min, util_max, best_cap = 0;
> > +     int fits, best_fits = -1;
> >       int cpu, best_cpu = -1;
> >       struct cpumask *cpus;
> >
> > @@ -6872,12 +6875,24 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> >
> >               if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> >                       continue;
> > -             if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > +
> > +             fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > +
> > +             /* This cpu fits with all capacity requirements */
>
> nit: s#capacity#capacity & performance#?
>
> > +             if (fits > 0)
> >                       return cpu;
> > +             /*
> > +              * Only the min bandwidth (i.e. uclamp_min) doesn't fit. Look
> > +              * for the cpu with highest bandwidth capacity.
> > +              */
>
> s/bandwidth/performance/?
>
> > +             else if (fits < 0)
> > +                     cpu_cap = capacity_of(cpu) - thermal_load_avg(cpu_rq(cpu));
>
> Hmm. Isn't capacity_of() already takes into account thermal_load_avg()?
>
> Did you mean capacity_orig_of()?

Yes

>
> >
> > -             if (cpu_cap > best_cap) {
> > +             if ((fits > best_fits) ||
> > +                 ((fits == best_fits) && (cpu_cap > best_cap))) {
> >                       best_cap = cpu_cap;
> >                       best_cpu = cpu;
> > +                     best_fits = fits;
>
> I'm not sure if this logic is correct. It's a bit of a mind  bender.
>
>         @iter#0
>
>                 fits <= 0
>                 best_fits <= -1
>
>                 if (fits > best_fits) // 0 > -1 => True
>                         ...     // update best_cap if larger
>                         best_fits <= 0
>
>         @iter#1
>
>                 fits <= -1
>                 best_fits <= 0
>
>                 if (fits > best_fits) // -1 > 0 => False
>
>                 if (fits == best_fits) // -1 == 0 => False
>
>                 // We will never update best_cap for all fits = -1 after
>                 // encountering the first fits = 0
>
> I think we should reverse the initial values and split the conditions

The copy/paste from feec() was too quick. It should be :

+             if ((fits < best_fits) ||
+                 ((fits == best_fits) && (cpu_cap > best_cap))) {

I don't think that the split gives any benefit but makes it more
difficult to read. I will add a comment
/*
* Select the CPU which fits better first (-1 being better than 0).
* Then, select the one with the largest capacity at the same level.
*/

>
>         int fits, best_fits = 0;
>
>                 if ((fits < best_fits)) {
>                         /* Reset best_cap for first "fits_but" */
>                         best_cap = cpu_cap;
>                         best_cpu = cpu;
>                         best_fits = fits;
>                 } else if ((fits == best_fits) && (cpu_cap > best_cap))) {
>                         best_cap = cpu_cap;
>                         best_cpu = cpu;
>                 }
>
> Which give us
>
>         @iter#0
>
>                 fits <= 0
>                 best_fits <= 0
>
>                 if (fits < best_fits) // 0 < 0 => False
>
>                 if (fits == best_fits) // 0 == 0 => True
>                         ...     // update best_cap if larger
>
>         @iter#1
>
>                 fits <= -1
>                 best_fits <= 0
>
>                 if (fits < best_fits) // -1 < 0 => True
>                         ...     // reset best_cap to first "fits_but" hit
>                         best_fits <= -1
>
>         @iter#2
>
>                 fits <= 0
>                 best_fits <= -1
>
>                 if (fits < best_fits) // 0 < -1 => False
>
>                 if (fits == best_fits) // 0 == -1 => False
>
>                 // We should never update best_cap for all fits == 0 now
>
>         @iter#3
>
>                 fits <= -1
>                 best_fits <= -1
>
>                 if (fits < best_fits) // -1 < -1 => False
>
>                 if (fits == best_fits) // -1 == -1 => True
>                         ...     // update best_cap if larger
>
>                 // Only fits = -1 will update best_cap if larger now
>
> Of course any hit with fits = 1 will return the cpu immediately.
>
>
> >               }
> >       }
> >
> > @@ -6890,7 +6905,7 @@ static inline bool asym_fits_cpu(unsigned long util,
> >                                int cpu)
> >  {
> >       if (sched_asym_cpucap_active())
> > -             return util_fits_cpu(util, util_min, util_max, cpu);
> > +             return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> >
> >       return true;
> >  }
> > @@ -7257,6 +7272,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >       unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> >       struct root_domain *rd = this_rq()->rd;
> >       int cpu, best_energy_cpu, target = -1;
> > +     int prev_fits = -1, best_fits = -1;
> >       struct sched_domain *sd;
> >       struct perf_domain *pd;
> >       struct energy_env eenv;
> > @@ -7288,10 +7304,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >               unsigned long cpu_cap, cpu_thermal_cap, util;
> >               unsigned long cur_delta, max_spare_cap = 0;
> >               unsigned long rq_util_min, rq_util_max;
> > -             unsigned long util_min, util_max;
> > +             unsigned long util_min = 0, util_max = 1024;
>
> Why this change? Are you hitting the same warning reported by Dan?

While debugging, I got random util_min|max values passed to
util_fits_cpu(). I agree that this is not a real problem because it
means that !uclamp_is_used() and the values will not be used in
util_fits_cpu() in this case but this is a hidden dependency which
seems a bit weak.

I can probably remove it from this patch as it's out of the scope

>
> >               unsigned long prev_spare_cap = 0;
> >               int max_spare_cap_cpu = -1;
> >               unsigned long base_energy;
> > +             int fits, max_fits = -1;
> >
> >               cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
> >
> > @@ -7344,7 +7361,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >                                       util_max = max(rq_util_max, p_util_max);
> >                               }
> >                       }
> > -                     if (!util_fits_cpu(util, util_min, util_max, cpu))
> > +
> > +                     fits = util_fits_cpu(util, util_min, util_max, cpu);
> > +                     if (!fits)
> >                               continue;
> >
> >                       lsub_positive(&cpu_cap, util);
> > @@ -7352,7 +7371,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >                       if (cpu == prev_cpu) {
> >                               /* Always use prev_cpu as a candidate. */
> >                               prev_spare_cap = cpu_cap;
> > -                     } else if (cpu_cap > max_spare_cap) {
> > +                             prev_fits = fits;
> > +                     } else if ((fits > max_fits) ||
> > +                                ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
> >                               /*
> >                                * Find the CPU with the maximum spare capacity
> >                                * among the remaining CPUs in the performance
> > @@ -7360,6 +7381,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >                                */
> >                               max_spare_cap = cpu_cap;
> >                               max_spare_cap_cpu = cpu;
> > +                             max_fits = fits;
>
> Should we reset best_delta here?
>
> Because we update max_fits here..
>
> >                       }
> >               }
> >
> > @@ -7389,15 +7411,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >                       if (cur_delta < base_energy)
> >                               goto unlock;
> >                       cur_delta -= base_energy;
> > -                     if (cur_delta < best_delta) {
> > +                     if ((fits > max_fits) ||
> > +                         ((fits == max_fits) && (cur_delta < best_delta))) {
>
> .. on first first transitions from -1 to 1; this condition will be
> skipped if cur_delta is lower than best delta. best_delta here could be the
> previous -1 fitting cpu.

But we want a cpu that fits in priority then the one with the smallest delta.

>
> We should reset best_delta on first transition then look if we encounter
> something with a better delta?

my mistake... This should be

+                     if ((max_fits > best_fits) ||
+                         ((max_fits == best_fits) && (cur_delta <
best_delta))) {

I'm going to prepare a new version

>
>
> Thanks!
>
> --
> Qais Yousef
>
> >                               best_delta = cur_delta;
> >                               best_energy_cpu = max_spare_cap_cpu;
> > +                             best_fits = max_fits;
> >                       }
> >               }
> >       }
> >       rcu_read_unlock();
> >
> > -     if (best_delta < prev_delta)
> > +     if ((best_fits > prev_fits) ||
> > +         ((best_fits == prev_fits) && (best_delta < prev_delta)))
> >               target = best_energy_cpu;
> >
> >       return target;
> > @@ -10164,24 +10189,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> >        */
> >       update_sd_lb_stats(env, &sds);
> >
> > -     if (sched_energy_enabled()) {
> > -             struct root_domain *rd = env->dst_rq->rd;
> > -
> > -             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > -                     goto out_balanced;
> > -     }
> > -
> > -     local = &sds.local_stat;
> > -     busiest = &sds.busiest_stat;
> > -
> >       /* There is no busy sibling group to pull tasks from */
> >       if (!sds.busiest)
> >               goto out_balanced;
> >
> > +     busiest = &sds.busiest_stat;
> > +
> >       /* Misfit tasks should be dealt with regardless of the avg load */
> >       if (busiest->group_type == group_misfit_task)
> >               goto force_balance;
> >
> > +     if (sched_energy_enabled()) {
> > +             struct root_domain *rd = env->dst_rq->rd;
> > +
> > +             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > +                     goto out_balanced;
> > +     }
> > +
> >       /* ASYM feature bypasses nice load balance check */
> >       if (busiest->group_type == group_asym_packing)
> >               goto force_balance;
> > @@ -10194,6 +10218,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> >       if (busiest->group_type == group_imbalanced)
> >               goto force_balance;
> >
> > +     local = &sds.local_stat;
> >       /*
> >        * If the local group is busier than the selected busiest group
> >        * don't try and pull any tasks.
> > --
> > 2.17.1
> >
> >
> >
> > >
> > >
> > > Thanks!!
> > >
> > > --
> > > Qais Yousef
Vincent Guittot Dec. 28, 2022, 5:18 p.m. UTC | #20
On Tue, 27 Dec 2022 at 14:33, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> On Fri, 23 Dec 2022 at 12:58, Qais Yousef <qyousef@layalina.io> wrote:
> >
> > On 12/20/22 14:50, Vincent Guittot wrote:
> >
> > Thanks for the patch!
> >
> > > Hereafter is what I came with in order to decouple misfit task with cpu
> > > overutilized. We keep using util_fits_cpu but with 3 values so we can keep
> > > using it with cpu_overutilized but exclude the case of misfit task
> > > because uclmap_min. Also select_idle_capacity() and feec() keep selecting the
> > > big cpu even if it doesn't fit only because of uclamp_min
> > >
> > >
> > > Subject: [PATCH] sched/fair: unlink misfit task from cpu overutilized
> > >
> > > By taking into account uclamp_min, the 1:1 relation between task misfit and
> > > cpu overutilized is no more true as a task with a util_avg of 20as an
> > > example may not fit a 1024 capacity cpu because of a uclamp_min constraint.
> > >
> > > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > > a CPU except for the uclamp_min hint which is a bandwidth requriement.
> >
> > nit: mixing uclamp with bandwidth has been a source of a lot of confusion when
> > discussing uclamp. Can we use performance requirement instead please?
>
> ok
>
> >
> > >
> > > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > > can use this new value to take additional action to select the best cpu
> > > that doesn't match uclamp_min.
> > >
> > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > > ---
> > >  kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++---------------
> > >  1 file changed, 49 insertions(+), 24 deletions(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 4423681baf15..705335d6af65 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
> > >        *     2. The system is being saturated when we're operating near
> > >        *        max capacity, it doesn't make sense to block overutilized.
> > >        */
> > > -     uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > > -     uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> > > +     uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
> > >       fits = fits || uclamp_max_fits;
> > >
> > >       /*
> > > @@ -4614,8 +4613,8 @@ static inline int util_fits_cpu(unsigned long util,
> > >        * handle the case uclamp_min > uclamp_max.
> > >        */
> > >       uclamp_min = min(uclamp_min, uclamp_max);
> > > -     if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > > -             fits = fits && (uclamp_min <= capacity_orig_thermal);
> > > +     if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > +             return -1;
> > >
> > >       return fits;
> >
> > nit: return !!fits?
> >
> > We check explicitly == 1 below and I'm not sure all the boolean check above
> > will guarantee we will end up return 1 for true on all combination of
> > compilerls/archs.
> >
> > >  }
> > > @@ -4625,7 +4624,7 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> > >       unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> > >       unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> > >       unsigned long util = task_util_est(p);
> > > -     return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > > +     return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) == 1);
> >
> > Or make this >  0?
>
> yes, will use > 0
>
> >
> > >  }
> > >
> > >  static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > > @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
> > >  #ifdef CONFIG_SMP
> > >  static inline bool cpu_overutilized(int cpu)
> > >  {
> > > -     return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
> > > +     unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > > +     unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> > > +
> > > +     return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> > >  }
> > >
> > >  static inline void update_overutilized_status(struct rq *rq)
> > > @@ -6857,6 +6859,7 @@ static int
> > >  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > >  {
> > >       unsigned long task_util, util_min, util_max, best_cap = 0;
> > > +     int fits, best_fits = -1;
> > >       int cpu, best_cpu = -1;
> > >       struct cpumask *cpus;
> > >
> > > @@ -6872,12 +6875,24 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > >
> > >               if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> > >                       continue;
> > > -             if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > > +
> > > +             fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > > +
> > > +             /* This cpu fits with all capacity requirements */
> >
> > nit: s#capacity#capacity & performance#?
> >
> > > +             if (fits > 0)
> > >                       return cpu;
> > > +             /*
> > > +              * Only the min bandwidth (i.e. uclamp_min) doesn't fit. Look
> > > +              * for the cpu with highest bandwidth capacity.
> > > +              */
> >
> > s/bandwidth/performance/?
> >
> > > +             else if (fits < 0)
> > > +                     cpu_cap = capacity_of(cpu) - thermal_load_avg(cpu_rq(cpu));
> >
> > Hmm. Isn't capacity_of() already takes into account thermal_load_avg()?
> >
> > Did you mean capacity_orig_of()?
>
> Yes
>
> >
> > >
> > > -             if (cpu_cap > best_cap) {
> > > +             if ((fits > best_fits) ||
> > > +                 ((fits == best_fits) && (cpu_cap > best_cap))) {
> > >                       best_cap = cpu_cap;
> > >                       best_cpu = cpu;
> > > +                     best_fits = fits;
> >
> > I'm not sure if this logic is correct. It's a bit of a mind  bender.
> >
> >         @iter#0
> >
> >                 fits <= 0
> >                 best_fits <= -1
> >
> >                 if (fits > best_fits) // 0 > -1 => True
> >                         ...     // update best_cap if larger
> >                         best_fits <= 0
> >
> >         @iter#1
> >
> >                 fits <= -1
> >                 best_fits <= 0
> >
> >                 if (fits > best_fits) // -1 > 0 => False
> >
> >                 if (fits == best_fits) // -1 == 0 => False
> >
> >                 // We will never update best_cap for all fits = -1 after
> >                 // encountering the first fits = 0
> >
> > I think we should reverse the initial values and split the conditions
>
> The copy/paste from feec() was too quick. It should be :
>
> +             if ((fits < best_fits) ||
> +                 ((fits == best_fits) && (cpu_cap > best_cap))) {
>
> I don't think that the split gives any benefit but makes it more
> difficult to read. I will add a comment
> /*
> * Select the CPU which fits better first (-1 being better than 0).
> * Then, select the one with the largest capacity at the same level.
> */
>
> >
> >         int fits, best_fits = 0;
> >
> >                 if ((fits < best_fits)) {
> >                         /* Reset best_cap for first "fits_but" */
> >                         best_cap = cpu_cap;
> >                         best_cpu = cpu;
> >                         best_fits = fits;
> >                 } else if ((fits == best_fits) && (cpu_cap > best_cap))) {
> >                         best_cap = cpu_cap;
> >                         best_cpu = cpu;
> >                 }
> >
> > Which give us
> >
> >         @iter#0
> >
> >                 fits <= 0
> >                 best_fits <= 0
> >
> >                 if (fits < best_fits) // 0 < 0 => False
> >
> >                 if (fits == best_fits) // 0 == 0 => True
> >                         ...     // update best_cap if larger
> >
> >         @iter#1
> >
> >                 fits <= -1
> >                 best_fits <= 0
> >
> >                 if (fits < best_fits) // -1 < 0 => True
> >                         ...     // reset best_cap to first "fits_but" hit
> >                         best_fits <= -1
> >
> >         @iter#2
> >
> >                 fits <= 0
> >                 best_fits <= -1
> >
> >                 if (fits < best_fits) // 0 < -1 => False
> >
> >                 if (fits == best_fits) // 0 == -1 => False
> >
> >                 // We should never update best_cap for all fits == 0 now
> >
> >         @iter#3
> >
> >                 fits <= -1
> >                 best_fits <= -1
> >
> >                 if (fits < best_fits) // -1 < -1 => False
> >
> >                 if (fits == best_fits) // -1 == -1 => True
> >                         ...     // update best_cap if larger
> >
> >                 // Only fits = -1 will update best_cap if larger now
> >
> > Of course any hit with fits = 1 will return the cpu immediately.
> >
> >
> > >               }
> > >       }
> > >
> > > @@ -6890,7 +6905,7 @@ static inline bool asym_fits_cpu(unsigned long util,
> > >                                int cpu)
> > >  {
> > >       if (sched_asym_cpucap_active())
> > > -             return util_fits_cpu(util, util_min, util_max, cpu);
> > > +             return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> > >
> > >       return true;
> > >  }
> > > @@ -7257,6 +7272,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >       unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > >       struct root_domain *rd = this_rq()->rd;
> > >       int cpu, best_energy_cpu, target = -1;
> > > +     int prev_fits = -1, best_fits = -1;
> > >       struct sched_domain *sd;
> > >       struct perf_domain *pd;
> > >       struct energy_env eenv;
> > > @@ -7288,10 +7304,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >               unsigned long cpu_cap, cpu_thermal_cap, util;
> > >               unsigned long cur_delta, max_spare_cap = 0;
> > >               unsigned long rq_util_min, rq_util_max;
> > > -             unsigned long util_min, util_max;
> > > +             unsigned long util_min = 0, util_max = 1024;
> >
> > Why this change? Are you hitting the same warning reported by Dan?
>
> While debugging, I got random util_min|max values passed to
> util_fits_cpu(). I agree that this is not a real problem because it
> means that !uclamp_is_used() and the values will not be used in
> util_fits_cpu() in this case but this is a hidden dependency which
> seems a bit weak.
>
> I can probably remove it from this patch as it's out of the scope
>
> >
> > >               unsigned long prev_spare_cap = 0;
> > >               int max_spare_cap_cpu = -1;
> > >               unsigned long base_energy;
> > > +             int fits, max_fits = -1;
> > >
> > >               cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
> > >
> > > @@ -7344,7 +7361,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                                       util_max = max(rq_util_max, p_util_max);
> > >                               }
> > >                       }
> > > -                     if (!util_fits_cpu(util, util_min, util_max, cpu))
> > > +
> > > +                     fits = util_fits_cpu(util, util_min, util_max, cpu);
> > > +                     if (!fits)
> > >                               continue;
> > >
> > >                       lsub_positive(&cpu_cap, util);
> > > @@ -7352,7 +7371,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                       if (cpu == prev_cpu) {
> > >                               /* Always use prev_cpu as a candidate. */
> > >                               prev_spare_cap = cpu_cap;
> > > -                     } else if (cpu_cap > max_spare_cap) {
> > > +                             prev_fits = fits;
> > > +                     } else if ((fits > max_fits) ||
> > > +                                ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
> > >                               /*
> > >                                * Find the CPU with the maximum spare capacity
> > >                                * among the remaining CPUs in the performance
> > > @@ -7360,6 +7381,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                                */
> > >                               max_spare_cap = cpu_cap;
> > >                               max_spare_cap_cpu = cpu;
> > > +                             max_fits = fits;
> >
> > Should we reset best_delta here?
> >
> > Because we update max_fits here..
> >
> > >                       }
> > >               }
> > >
> > > @@ -7389,15 +7411,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                       if (cur_delta < base_energy)
> > >                               goto unlock;
> > >                       cur_delta -= base_energy;
> > > -                     if (cur_delta < best_delta) {
> > > +                     if ((fits > max_fits) ||
> > > +                         ((fits == max_fits) && (cur_delta < best_delta))) {
> >
> > .. on first first transitions from -1 to 1; this condition will be
> > skipped if cur_delta is lower than best delta. best_delta here could be the
> > previous -1 fitting cpu.
>
> But we want a cpu that fits in priority then the one with the smallest delta.
>
> >
> > We should reset best_delta on first transition then look if we encounter
> > something with a better delta?
>
> my mistake... This should be
>
> +                     if ((max_fits > best_fits) ||
> +                         ((max_fits == best_fits) && (cur_delta <
> best_delta))) {
>
> I'm going to prepare a new version

I just sent a new version here:
https://lore.kernel.org/lkml/20221228165415.3436-1-vincent.guittot@linaro.org/

>
> >
> >
> > Thanks!
> >
> > --
> > Qais Yousef
> >
> > >                               best_delta = cur_delta;
> > >                               best_energy_cpu = max_spare_cap_cpu;
> > > +                             best_fits = max_fits;
> > >                       }
> > >               }
> > >       }
> > >       rcu_read_unlock();
> > >
> > > -     if (best_delta < prev_delta)
> > > +     if ((best_fits > prev_fits) ||
> > > +         ((best_fits == prev_fits) && (best_delta < prev_delta)))
> > >               target = best_energy_cpu;
> > >
> > >       return target;
> > > @@ -10164,24 +10189,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > >        */
> > >       update_sd_lb_stats(env, &sds);
> > >
> > > -     if (sched_energy_enabled()) {
> > > -             struct root_domain *rd = env->dst_rq->rd;
> > > -
> > > -             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > -                     goto out_balanced;
> > > -     }
> > > -
> > > -     local = &sds.local_stat;
> > > -     busiest = &sds.busiest_stat;
> > > -
> > >       /* There is no busy sibling group to pull tasks from */
> > >       if (!sds.busiest)
> > >               goto out_balanced;
> > >
> > > +     busiest = &sds.busiest_stat;
> > > +
> > >       /* Misfit tasks should be dealt with regardless of the avg load */
> > >       if (busiest->group_type == group_misfit_task)
> > >               goto force_balance;
> > >
> > > +     if (sched_energy_enabled()) {
> > > +             struct root_domain *rd = env->dst_rq->rd;
> > > +
> > > +             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > +                     goto out_balanced;
> > > +     }
> > > +
> > >       /* ASYM feature bypasses nice load balance check */
> > >       if (busiest->group_type == group_asym_packing)
> > >               goto force_balance;
> > > @@ -10194,6 +10218,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > >       if (busiest->group_type == group_imbalanced)
> > >               goto force_balance;
> > >
> > > +     local = &sds.local_stat;
> > >       /*
> > >        * If the local group is busier than the selected busiest group
> > >        * don't try and pull any tasks.
> > > --
> > > 2.17.1
> > >
> > >
> > >
> > > >
> > > >
> > > > Thanks!!
> > > >
> > > > --
> > > > Qais Yousef
Qais Yousef Jan. 9, 2023, 4:40 p.m. UTC | #21
On 12/27/22 14:33, Vincent Guittot wrote:

Sorry for late response; it was the holiday season :-)

> On Fri, 23 Dec 2022 at 12:58, Qais Yousef <qyousef@layalina.io> wrote:
> >
> > On 12/20/22 14:50, Vincent Guittot wrote:
> >
> > Thanks for the patch!
> >
> > > Hereafter is what I came with in order to decouple misfit task with cpu
> > > overutilized. We keep using util_fits_cpu but with 3 values so we can keep
> > > using it with cpu_overutilized but exclude the case of misfit task
> > > because uclmap_min. Also select_idle_capacity() and feec() keep selecting the
> > > big cpu even if it doesn't fit only because of uclamp_min
> > >
> > >
> > > Subject: [PATCH] sched/fair: unlink misfit task from cpu overutilized
> > >
> > > By taking into account uclamp_min, the 1:1 relation between task misfit and
> > > cpu overutilized is no more true as a task with a util_avg of 20as an
> > > example may not fit a 1024 capacity cpu because of a uclamp_min constraint.
> > >
> > > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > > a CPU except for the uclamp_min hint which is a bandwidth requriement.
> >
> > nit: mixing uclamp with bandwidth has been a source of a lot of confusion when
> > discussing uclamp. Can we use performance requirement instead please?
> 
> ok
> 
> >
> > >
> > > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > > can use this new value to take additional action to select the best cpu
> > > that doesn't match uclamp_min.
> > >
> > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > > ---
> > >  kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++---------------
> > >  1 file changed, 49 insertions(+), 24 deletions(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 4423681baf15..705335d6af65 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
> > >        *     2. The system is being saturated when we're operating near
> > >        *        max capacity, it doesn't make sense to block overutilized.
> > >        */
> > > -     uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > > -     uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> > > +     uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
> > >       fits = fits || uclamp_max_fits;
> > >
> > >       /*
> > > @@ -4614,8 +4613,8 @@ static inline int util_fits_cpu(unsigned long util,
> > >        * handle the case uclamp_min > uclamp_max.
> > >        */
> > >       uclamp_min = min(uclamp_min, uclamp_max);
> > > -     if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > > -             fits = fits && (uclamp_min <= capacity_orig_thermal);
> > > +     if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > +             return -1;
> > >
> > >       return fits;
> >
> > nit: return !!fits?
> >
> > We check explicitly == 1 below and I'm not sure all the boolean check above
> > will guarantee we will end up return 1 for true on all combination of
> > compilerls/archs.
> >
> > >  }
> > > @@ -4625,7 +4624,7 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> > >       unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> > >       unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> > >       unsigned long util = task_util_est(p);
> > > -     return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > > +     return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) == 1);
> >
> > Or make this >  0?
> 
> yes, will use > 0
> 
> >
> > >  }
> > >
> > >  static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > > @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
> > >  #ifdef CONFIG_SMP
> > >  static inline bool cpu_overutilized(int cpu)
> > >  {
> > > -     return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
> > > +     unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > > +     unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> > > +
> > > +     return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> > >  }
> > >
> > >  static inline void update_overutilized_status(struct rq *rq)
> > > @@ -6857,6 +6859,7 @@ static int
> > >  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > >  {
> > >       unsigned long task_util, util_min, util_max, best_cap = 0;
> > > +     int fits, best_fits = -1;
> > >       int cpu, best_cpu = -1;
> > >       struct cpumask *cpus;
> > >
> > > @@ -6872,12 +6875,24 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > >
> > >               if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> > >                       continue;
> > > -             if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > > +
> > > +             fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > > +
> > > +             /* This cpu fits with all capacity requirements */
> >
> > nit: s#capacity#capacity & performance#?
> >
> > > +             if (fits > 0)
> > >                       return cpu;
> > > +             /*
> > > +              * Only the min bandwidth (i.e. uclamp_min) doesn't fit. Look
> > > +              * for the cpu with highest bandwidth capacity.
> > > +              */
> >
> > s/bandwidth/performance/?
> >
> > > +             else if (fits < 0)
> > > +                     cpu_cap = capacity_of(cpu) - thermal_load_avg(cpu_rq(cpu));
> >
> > Hmm. Isn't capacity_of() already takes into account thermal_load_avg()?
> >
> > Did you mean capacity_orig_of()?
> 
> Yes
> 
> >
> > >
> > > -             if (cpu_cap > best_cap) {
> > > +             if ((fits > best_fits) ||
> > > +                 ((fits == best_fits) && (cpu_cap > best_cap))) {
> > >                       best_cap = cpu_cap;
> > >                       best_cpu = cpu;
> > > +                     best_fits = fits;
> >
> > I'm not sure if this logic is correct. It's a bit of a mind  bender.
> >
> >         @iter#0
> >
> >                 fits <= 0
> >                 best_fits <= -1
> >
> >                 if (fits > best_fits) // 0 > -1 => True
> >                         ...     // update best_cap if larger
> >                         best_fits <= 0
> >
> >         @iter#1
> >
> >                 fits <= -1
> >                 best_fits <= 0
> >
> >                 if (fits > best_fits) // -1 > 0 => False
> >
> >                 if (fits == best_fits) // -1 == 0 => False
> >
> >                 // We will never update best_cap for all fits = -1 after
> >                 // encountering the first fits = 0
> >
> > I think we should reverse the initial values and split the conditions
> 
> The copy/paste from feec() was too quick. It should be :
> 
> +             if ((fits < best_fits) ||
> +                 ((fits == best_fits) && (cpu_cap > best_cap))) {
> 
> I don't think that the split gives any benefit but makes it more
> difficult to read. I will add a comment
> /*
> * Select the CPU which fits better first (-1 being better than 0).
> * Then, select the one with the largest capacity at the same level.
> */

I think that should work yes. I might have gotten confused; I'll look closely
again in the new version in case I caught something before but I forgot about
now.

> 
> >
> >         int fits, best_fits = 0;
> >
> >                 if ((fits < best_fits)) {
> >                         /* Reset best_cap for first "fits_but" */
> >                         best_cap = cpu_cap;
> >                         best_cpu = cpu;
> >                         best_fits = fits;
> >                 } else if ((fits == best_fits) && (cpu_cap > best_cap))) {
> >                         best_cap = cpu_cap;
> >                         best_cpu = cpu;
> >                 }
> >
> > Which give us
> >
> >         @iter#0
> >
> >                 fits <= 0
> >                 best_fits <= 0
> >
> >                 if (fits < best_fits) // 0 < 0 => False
> >
> >                 if (fits == best_fits) // 0 == 0 => True
> >                         ...     // update best_cap if larger
> >
> >         @iter#1
> >
> >                 fits <= -1
> >                 best_fits <= 0
> >
> >                 if (fits < best_fits) // -1 < 0 => True
> >                         ...     // reset best_cap to first "fits_but" hit
> >                         best_fits <= -1
> >
> >         @iter#2
> >
> >                 fits <= 0
> >                 best_fits <= -1
> >
> >                 if (fits < best_fits) // 0 < -1 => False
> >
> >                 if (fits == best_fits) // 0 == -1 => False
> >
> >                 // We should never update best_cap for all fits == 0 now
> >
> >         @iter#3
> >
> >                 fits <= -1
> >                 best_fits <= -1
> >
> >                 if (fits < best_fits) // -1 < -1 => False
> >
> >                 if (fits == best_fits) // -1 == -1 => True
> >                         ...     // update best_cap if larger
> >
> >                 // Only fits = -1 will update best_cap if larger now
> >
> > Of course any hit with fits = 1 will return the cpu immediately.
> >
> >
> > >               }
> > >       }
> > >
> > > @@ -6890,7 +6905,7 @@ static inline bool asym_fits_cpu(unsigned long util,
> > >                                int cpu)
> > >  {
> > >       if (sched_asym_cpucap_active())
> > > -             return util_fits_cpu(util, util_min, util_max, cpu);
> > > +             return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> > >
> > >       return true;
> > >  }
> > > @@ -7257,6 +7272,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >       unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > >       struct root_domain *rd = this_rq()->rd;
> > >       int cpu, best_energy_cpu, target = -1;
> > > +     int prev_fits = -1, best_fits = -1;
> > >       struct sched_domain *sd;
> > >       struct perf_domain *pd;
> > >       struct energy_env eenv;
> > > @@ -7288,10 +7304,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >               unsigned long cpu_cap, cpu_thermal_cap, util;
> > >               unsigned long cur_delta, max_spare_cap = 0;
> > >               unsigned long rq_util_min, rq_util_max;
> > > -             unsigned long util_min, util_max;
> > > +             unsigned long util_min = 0, util_max = 1024;
> >
> > Why this change? Are you hitting the same warning reported by Dan?
> 
> While debugging, I got random util_min|max values passed to
> util_fits_cpu(). I agree that this is not a real problem because it
> means that !uclamp_is_used() and the values will not be used in
> util_fits_cpu() in this case but this is a hidden dependency which
> seems a bit weak.
> 
> I can probably remove it from this patch as it's out of the scope

Patch 1 of this series addresses this already :-)

Talking about this serries; I'm confused what's the plan for patch 2 now?

My understanding was Peter should pick 1 and 2 as fixes until we nail this
patch out.

> 
> >
> > >               unsigned long prev_spare_cap = 0;
> > >               int max_spare_cap_cpu = -1;
> > >               unsigned long base_energy;
> > > +             int fits, max_fits = -1;
> > >
> > >               cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
> > >
> > > @@ -7344,7 +7361,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                                       util_max = max(rq_util_max, p_util_max);
> > >                               }
> > >                       }
> > > -                     if (!util_fits_cpu(util, util_min, util_max, cpu))
> > > +
> > > +                     fits = util_fits_cpu(util, util_min, util_max, cpu);
> > > +                     if (!fits)
> > >                               continue;
> > >
> > >                       lsub_positive(&cpu_cap, util);
> > > @@ -7352,7 +7371,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                       if (cpu == prev_cpu) {
> > >                               /* Always use prev_cpu as a candidate. */
> > >                               prev_spare_cap = cpu_cap;
> > > -                     } else if (cpu_cap > max_spare_cap) {
> > > +                             prev_fits = fits;
> > > +                     } else if ((fits > max_fits) ||
> > > +                                ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
> > >                               /*
> > >                                * Find the CPU with the maximum spare capacity
> > >                                * among the remaining CPUs in the performance
> > > @@ -7360,6 +7381,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                                */
> > >                               max_spare_cap = cpu_cap;
> > >                               max_spare_cap_cpu = cpu;
> > > +                             max_fits = fits;
> >
> > Should we reset best_delta here?
> >
> > Because we update max_fits here..
> >
> > >                       }
> > >               }
> > >
> > > @@ -7389,15 +7411,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >                       if (cur_delta < base_energy)
> > >                               goto unlock;
> > >                       cur_delta -= base_energy;
> > > -                     if (cur_delta < best_delta) {
> > > +                     if ((fits > max_fits) ||
> > > +                         ((fits == max_fits) && (cur_delta < best_delta))) {
> >
> > .. on first first transitions from -1 to 1; this condition will be
> > skipped if cur_delta is lower than best delta. best_delta here could be the
> > previous -1 fitting cpu.
> 
> But we want a cpu that fits in priority then the one with the smallest delta.

Yes; but the smallest delta should be updated when we update the 'priority'.

> 
> >
> > We should reset best_delta on first transition then look if we encounter
> > something with a better delta?
> 
> my mistake... This should be
> 
> +                     if ((max_fits > best_fits) ||
> +                         ((max_fits == best_fits) && (cur_delta <
> best_delta))) {
> 
> I'm going to prepare a new version

Hmm I'll go through this in the new patch.


Thanks!

--
Qais Yousef

> 
> >
> >
> > Thanks!
> >
> > --
> > Qais Yousef
> >
> > >                               best_delta = cur_delta;
> > >                               best_energy_cpu = max_spare_cap_cpu;
> > > +                             best_fits = max_fits;
> > >                       }
> > >               }
> > >       }
> > >       rcu_read_unlock();
> > >
> > > -     if (best_delta < prev_delta)
> > > +     if ((best_fits > prev_fits) ||
> > > +         ((best_fits == prev_fits) && (best_delta < prev_delta)))
> > >               target = best_energy_cpu;
> > >
> > >       return target;
> > > @@ -10164,24 +10189,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > >        */
> > >       update_sd_lb_stats(env, &sds);
> > >
> > > -     if (sched_energy_enabled()) {
> > > -             struct root_domain *rd = env->dst_rq->rd;
> > > -
> > > -             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > -                     goto out_balanced;
> > > -     }
> > > -
> > > -     local = &sds.local_stat;
> > > -     busiest = &sds.busiest_stat;
> > > -
> > >       /* There is no busy sibling group to pull tasks from */
> > >       if (!sds.busiest)
> > >               goto out_balanced;
> > >
> > > +     busiest = &sds.busiest_stat;
> > > +
> > >       /* Misfit tasks should be dealt with regardless of the avg load */
> > >       if (busiest->group_type == group_misfit_task)
> > >               goto force_balance;
> > >
> > > +     if (sched_energy_enabled()) {
> > > +             struct root_domain *rd = env->dst_rq->rd;
> > > +
> > > +             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > +                     goto out_balanced;
> > > +     }
> > > +
> > >       /* ASYM feature bypasses nice load balance check */
> > >       if (busiest->group_type == group_asym_packing)
> > >               goto force_balance;
> > > @@ -10194,6 +10218,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > >       if (busiest->group_type == group_imbalanced)
> > >               goto force_balance;
> > >
> > > +     local = &sds.local_stat;
> > >       /*
> > >        * If the local group is busier than the selected busiest group
> > >        * don't try and pull any tasks.
> > > --
> > > 2.17.1
> > >
> > >
> > >
> > > >
> > > >
> > > > Thanks!!
> > > >
> > > > --
> > > > Qais Yousef
Vincent Guittot Jan. 10, 2023, 4:38 p.m. UTC | #22
On Mon, 9 Jan 2023 at 17:40, Qais Yousef <qyousef@layalina.io> wrote:
>
> On 12/27/22 14:33, Vincent Guittot wrote:
>
> Sorry for late response; it was the holiday season :-)
>
> > On Fri, 23 Dec 2022 at 12:58, Qais Yousef <qyousef@layalina.io> wrote:
> > >
> > > On 12/20/22 14:50, Vincent Guittot wrote:
> > >
> > > Thanks for the patch!
> > >
> > > > Hereafter is what I came with in order to decouple misfit task with cpu
> > > > overutilized. We keep using util_fits_cpu but with 3 values so we can keep
> > > > using it with cpu_overutilized but exclude the case of misfit task
> > > > because uclmap_min. Also select_idle_capacity() and feec() keep selecting the
> > > > big cpu even if it doesn't fit only because of uclamp_min
> > > >
> > > >
> > > > Subject: [PATCH] sched/fair: unlink misfit task from cpu overutilized
> > > >
> > > > By taking into account uclamp_min, the 1:1 relation between task misfit and
> > > > cpu overutilized is no more true as a task with a util_avg of 20as an
> > > > example may not fit a 1024 capacity cpu because of a uclamp_min constraint.
> > > >
> > > > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > > > a CPU except for the uclamp_min hint which is a bandwidth requriement.
> > >
> > > nit: mixing uclamp with bandwidth has been a source of a lot of confusion when
> > > discussing uclamp. Can we use performance requirement instead please?
> >
> > ok
> >
> > >
> > > >
> > > > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > > > can use this new value to take additional action to select the best cpu
> > > > that doesn't match uclamp_min.
> > > >
> > > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > > > ---
> > > >  kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++---------------
> > > >  1 file changed, 49 insertions(+), 24 deletions(-)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 4423681baf15..705335d6af65 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -4578,8 +4578,7 @@ static inline int util_fits_cpu(unsigned long util,
> > > >        *     2. The system is being saturated when we're operating near
> > > >        *        max capacity, it doesn't make sense to block overutilized.
> > > >        */
> > > > -     uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > > > -     uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> > > > +     uclamp_max_fits = (uclamp_max <= capacity_orig) || (capacity_orig == SCHED_CAPACITY_SCALE);
> > > >       fits = fits || uclamp_max_fits;
> > > >
> > > >       /*
> > > > @@ -4614,8 +4613,8 @@ static inline int util_fits_cpu(unsigned long util,
> > > >        * handle the case uclamp_min > uclamp_max.
> > > >        */
> > > >       uclamp_min = min(uclamp_min, uclamp_max);
> > > > -     if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > > > -             fits = fits && (uclamp_min <= capacity_orig_thermal);
> > > > +     if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > > +             return -1;
> > > >
> > > >       return fits;
> > >
> > > nit: return !!fits?
> > >
> > > We check explicitly == 1 below and I'm not sure all the boolean check above
> > > will guarantee we will end up return 1 for true on all combination of
> > > compilerls/archs.
> > >
> > > >  }
> > > > @@ -4625,7 +4624,7 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> > > >       unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> > > >       unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> > > >       unsigned long util = task_util_est(p);
> > > > -     return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > > > +     return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) == 1);
> > >
> > > Or make this >  0?
> >
> > yes, will use > 0
> >
> > >
> > > >  }
> > > >
> > > >  static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > > > @@ -6064,7 +6063,10 @@ static inline void hrtick_update(struct rq *rq)
> > > >  #ifdef CONFIG_SMP
> > > >  static inline bool cpu_overutilized(int cpu)
> > > >  {
> > > > -     return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
> > > > +     unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > > > +     unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> > > > +
> > > > +     return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> > > >  }
> > > >
> > > >  static inline void update_overutilized_status(struct rq *rq)
> > > > @@ -6857,6 +6859,7 @@ static int
> > > >  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > > >  {
> > > >       unsigned long task_util, util_min, util_max, best_cap = 0;
> > > > +     int fits, best_fits = -1;
> > > >       int cpu, best_cpu = -1;
> > > >       struct cpumask *cpus;
> > > >
> > > > @@ -6872,12 +6875,24 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > > >
> > > >               if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> > > >                       continue;
> > > > -             if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > > > +
> > > > +             fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > > > +
> > > > +             /* This cpu fits with all capacity requirements */
> > >
> > > nit: s#capacity#capacity & performance#?
> > >
> > > > +             if (fits > 0)
> > > >                       return cpu;
> > > > +             /*
> > > > +              * Only the min bandwidth (i.e. uclamp_min) doesn't fit. Look
> > > > +              * for the cpu with highest bandwidth capacity.
> > > > +              */
> > >
> > > s/bandwidth/performance/?
> > >
> > > > +             else if (fits < 0)
> > > > +                     cpu_cap = capacity_of(cpu) - thermal_load_avg(cpu_rq(cpu));
> > >
> > > Hmm. Isn't capacity_of() already takes into account thermal_load_avg()?
> > >
> > > Did you mean capacity_orig_of()?
> >
> > Yes
> >
> > >
> > > >
> > > > -             if (cpu_cap > best_cap) {
> > > > +             if ((fits > best_fits) ||
> > > > +                 ((fits == best_fits) && (cpu_cap > best_cap))) {
> > > >                       best_cap = cpu_cap;
> > > >                       best_cpu = cpu;
> > > > +                     best_fits = fits;
> > >
> > > I'm not sure if this logic is correct. It's a bit of a mind  bender.
> > >
> > >         @iter#0
> > >
> > >                 fits <= 0
> > >                 best_fits <= -1
> > >
> > >                 if (fits > best_fits) // 0 > -1 => True
> > >                         ...     // update best_cap if larger
> > >                         best_fits <= 0
> > >
> > >         @iter#1
> > >
> > >                 fits <= -1
> > >                 best_fits <= 0
> > >
> > >                 if (fits > best_fits) // -1 > 0 => False
> > >
> > >                 if (fits == best_fits) // -1 == 0 => False
> > >
> > >                 // We will never update best_cap for all fits = -1 after
> > >                 // encountering the first fits = 0
> > >
> > > I think we should reverse the initial values and split the conditions
> >
> > The copy/paste from feec() was too quick. It should be :
> >
> > +             if ((fits < best_fits) ||
> > +                 ((fits == best_fits) && (cpu_cap > best_cap))) {
> >
> > I don't think that the split gives any benefit but makes it more
> > difficult to read. I will add a comment
> > /*
> > * Select the CPU which fits better first (-1 being better than 0).
> > * Then, select the one with the largest capacity at the same level.
> > */
>
> I think that should work yes. I might have gotten confused; I'll look closely
> again in the new version in case I caught something before but I forgot about
> now.
>
> >
> > >
> > >         int fits, best_fits = 0;
> > >
> > >                 if ((fits < best_fits)) {
> > >                         /* Reset best_cap for first "fits_but" */
> > >                         best_cap = cpu_cap;
> > >                         best_cpu = cpu;
> > >                         best_fits = fits;
> > >                 } else if ((fits == best_fits) && (cpu_cap > best_cap))) {
> > >                         best_cap = cpu_cap;
> > >                         best_cpu = cpu;
> > >                 }
> > >
> > > Which give us
> > >
> > >         @iter#0
> > >
> > >                 fits <= 0
> > >                 best_fits <= 0
> > >
> > >                 if (fits < best_fits) // 0 < 0 => False
> > >
> > >                 if (fits == best_fits) // 0 == 0 => True
> > >                         ...     // update best_cap if larger
> > >
> > >         @iter#1
> > >
> > >                 fits <= -1
> > >                 best_fits <= 0
> > >
> > >                 if (fits < best_fits) // -1 < 0 => True
> > >                         ...     // reset best_cap to first "fits_but" hit
> > >                         best_fits <= -1
> > >
> > >         @iter#2
> > >
> > >                 fits <= 0
> > >                 best_fits <= -1
> > >
> > >                 if (fits < best_fits) // 0 < -1 => False
> > >
> > >                 if (fits == best_fits) // 0 == -1 => False
> > >
> > >                 // We should never update best_cap for all fits == 0 now
> > >
> > >         @iter#3
> > >
> > >                 fits <= -1
> > >                 best_fits <= -1
> > >
> > >                 if (fits < best_fits) // -1 < -1 => False
> > >
> > >                 if (fits == best_fits) // -1 == -1 => True
> > >                         ...     // update best_cap if larger
> > >
> > >                 // Only fits = -1 will update best_cap if larger now
> > >
> > > Of course any hit with fits = 1 will return the cpu immediately.
> > >
> > >
> > > >               }
> > > >       }
> > > >
> > > > @@ -6890,7 +6905,7 @@ static inline bool asym_fits_cpu(unsigned long util,
> > > >                                int cpu)
> > > >  {
> > > >       if (sched_asym_cpucap_active())
> > > > -             return util_fits_cpu(util, util_min, util_max, cpu);
> > > > +             return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> > > >
> > > >       return true;
> > > >  }
> > > > @@ -7257,6 +7272,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >       unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > > >       struct root_domain *rd = this_rq()->rd;
> > > >       int cpu, best_energy_cpu, target = -1;
> > > > +     int prev_fits = -1, best_fits = -1;
> > > >       struct sched_domain *sd;
> > > >       struct perf_domain *pd;
> > > >       struct energy_env eenv;
> > > > @@ -7288,10 +7304,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >               unsigned long cpu_cap, cpu_thermal_cap, util;
> > > >               unsigned long cur_delta, max_spare_cap = 0;
> > > >               unsigned long rq_util_min, rq_util_max;
> > > > -             unsigned long util_min, util_max;
> > > > +             unsigned long util_min = 0, util_max = 1024;
> > >
> > > Why this change? Are you hitting the same warning reported by Dan?
> >
> > While debugging, I got random util_min|max values passed to
> > util_fits_cpu(). I agree that this is not a real problem because it
> > means that !uclamp_is_used() and the values will not be used in
> > util_fits_cpu() in this case but this is a hidden dependency which
> > seems a bit weak.
> >
> > I can probably remove it from this patch as it's out of the scope
>
> Patch 1 of this series addresses this already :-)

Ah yes.

>
> Talking about this serries; I'm confused what's the plan for patch 2 now?
>
> My understanding was Peter should pick 1 and 2 as fixes until we nail this
> patch out.

yes patch 1 and 2 should be merged to fix mainline implementation. As
discussed offline, the end goal remains to remove any kind of external
for loop in load balance

>
> >
> > >
> > > >               unsigned long prev_spare_cap = 0;
> > > >               int max_spare_cap_cpu = -1;
> > > >               unsigned long base_energy;
> > > > +             int fits, max_fits = -1;
> > > >
> > > >               cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
> > > >
> > > > @@ -7344,7 +7361,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >                                       util_max = max(rq_util_max, p_util_max);
> > > >                               }
> > > >                       }
> > > > -                     if (!util_fits_cpu(util, util_min, util_max, cpu))
> > > > +
> > > > +                     fits = util_fits_cpu(util, util_min, util_max, cpu);
> > > > +                     if (!fits)
> > > >                               continue;
> > > >
> > > >                       lsub_positive(&cpu_cap, util);
> > > > @@ -7352,7 +7371,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >                       if (cpu == prev_cpu) {
> > > >                               /* Always use prev_cpu as a candidate. */
> > > >                               prev_spare_cap = cpu_cap;
> > > > -                     } else if (cpu_cap > max_spare_cap) {
> > > > +                             prev_fits = fits;
> > > > +                     } else if ((fits > max_fits) ||
> > > > +                                ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
> > > >                               /*
> > > >                                * Find the CPU with the maximum spare capacity
> > > >                                * among the remaining CPUs in the performance
> > > > @@ -7360,6 +7381,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >                                */
> > > >                               max_spare_cap = cpu_cap;
> > > >                               max_spare_cap_cpu = cpu;
> > > > +                             max_fits = fits;
> > >
> > > Should we reset best_delta here?
> > >
> > > Because we update max_fits here..
> > >
> > > >                       }
> > > >               }
> > > >
> > > > @@ -7389,15 +7411,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >                       if (cur_delta < base_energy)
> > > >                               goto unlock;
> > > >                       cur_delta -= base_energy;
> > > > -                     if (cur_delta < best_delta) {
> > > > +                     if ((fits > max_fits) ||
> > > > +                         ((fits == max_fits) && (cur_delta < best_delta))) {
> > >
> > > .. on first first transitions from -1 to 1; this condition will be
> > > skipped if cur_delta is lower than best delta. best_delta here could be the
> > > previous -1 fitting cpu.
> >
> > But we want a cpu that fits in priority then the one with the smallest delta.
>
> Yes; but the smallest delta should be updated when we update the 'priority'.
>
> >
> > >
> > > We should reset best_delta on first transition then look if we encounter
> > > something with a better delta?
> >
> > my mistake... This should be
> >
> > +                     if ((max_fits > best_fits) ||
> > +                         ((max_fits == best_fits) && (cur_delta <
> > best_delta))) {
> >
> > I'm going to prepare a new version
>
> Hmm I'll go through this in the new patch.
>
>
> Thanks!
>
> --
> Qais Yousef
>
> >
> > >
> > >
> > > Thanks!
> > >
> > > --
> > > Qais Yousef
> > >
> > > >                               best_delta = cur_delta;
> > > >                               best_energy_cpu = max_spare_cap_cpu;
> > > > +                             best_fits = max_fits;
> > > >                       }
> > > >               }
> > > >       }
> > > >       rcu_read_unlock();
> > > >
> > > > -     if (best_delta < prev_delta)
> > > > +     if ((best_fits > prev_fits) ||
> > > > +         ((best_fits == prev_fits) && (best_delta < prev_delta)))
> > > >               target = best_energy_cpu;
> > > >
> > > >       return target;
> > > > @@ -10164,24 +10189,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > > >        */
> > > >       update_sd_lb_stats(env, &sds);
> > > >
> > > > -     if (sched_energy_enabled()) {
> > > > -             struct root_domain *rd = env->dst_rq->rd;
> > > > -
> > > > -             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > > -                     goto out_balanced;
> > > > -     }
> > > > -
> > > > -     local = &sds.local_stat;
> > > > -     busiest = &sds.busiest_stat;
> > > > -
> > > >       /* There is no busy sibling group to pull tasks from */
> > > >       if (!sds.busiest)
> > > >               goto out_balanced;
> > > >
> > > > +     busiest = &sds.busiest_stat;
> > > > +
> > > >       /* Misfit tasks should be dealt with regardless of the avg load */
> > > >       if (busiest->group_type == group_misfit_task)
> > > >               goto force_balance;
> > > >
> > > > +     if (sched_energy_enabled()) {
> > > > +             struct root_domain *rd = env->dst_rq->rd;
> > > > +
> > > > +             if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > > +                     goto out_balanced;
> > > > +     }
> > > > +
> > > >       /* ASYM feature bypasses nice load balance check */
> > > >       if (busiest->group_type == group_asym_packing)
> > > >               goto force_balance;
> > > > @@ -10194,6 +10218,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > > >       if (busiest->group_type == group_imbalanced)
> > > >               goto force_balance;
> > > >
> > > > +     local = &sds.local_stat;
> > > >       /*
> > > >        * If the local group is busier than the selected busiest group
> > > >        * don't try and pull any tasks.
> > > > --
> > > > 2.17.1
> > > >
> > > >
> > > >
> > > > >
> > > > >
> > > > > Thanks!!
> > > > >
> > > > > --
> > > > > Qais Yousef
Qais Yousef Jan. 10, 2023, 4:44 p.m. UTC | #23
On 01/10/23 17:38, Vincent Guittot wrote:

[...]

> > Patch 1 of this series addresses this already :-)
> 
> Ah yes.
> 
> >
> > Talking about this serries; I'm confused what's the plan for patch 2 now?
> >
> > My understanding was Peter should pick 1 and 2 as fixes until we nail this
> > patch out.
> 
> yes patch 1 and 2 should be merged to fix mainline implementation. As
> discussed offline, the end goal remains to remove any kind of external
> for loop in load balance

Good. I'll spin v3 so it'd be easier for Peter/Ingo to pick them up.
Meanwhile I'll be testing and continuing reviewing your patch.


Thanks!

--
Qais Yousef
diff mbox series

Patch

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 69b3d61852ac..b11e7c545fc1 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -31,17 +31,7 @@ 
 #include <linux/units.h>
 #include <trace/events/power.h>
 
-static LIST_HEAD(cpufreq_policy_list);
-
-/* Macros to iterate over CPU policies */
-#define for_each_suitable_policy(__policy, __active)			 \
-	list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
-		if ((__active) == !policy_is_inactive(__policy))
-
-#define for_each_active_policy(__policy)		\
-	for_each_suitable_policy(__policy, true)
-#define for_each_inactive_policy(__policy)		\
-	for_each_suitable_policy(__policy, false)
+LIST_HEAD(cpufreq_policy_list);
 
 /* Iterate over governors */
 static LIST_HEAD(cpufreq_governor_list);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d5595d57f4e5..c3c79d4ad821 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -780,6 +780,32 @@  static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
 			continue;						\
 		else
 
+#ifdef CONFIG_CPU_FREQ
+extern struct list_head cpufreq_policy_list;
+
+/* Macros to iterate over CPU policies */
+#define for_each_suitable_policy(__policy, __active)			 \
+	list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
+		if ((__active) == !policy_is_inactive(__policy))
+
+#define for_each_suitable_policy_safe(__policy, __n, __active)			   \
+	list_for_each_entry_safe(__policy, __n, &cpufreq_policy_list, policy_list) \
+		if ((__active) == !policy_is_inactive(__policy))
+#else
+#define for_each_suitable_policy(__policy, __active)		while (0)
+#define for_each_suitable_policy_safe(__policy, __n, __active)	while (0)
+#endif
+
+#define for_each_active_policy(__policy)		\
+	for_each_suitable_policy(__policy, true)
+#define for_each_inactive_policy(__policy)		\
+	for_each_suitable_policy(__policy, false)
+
+#define for_each_active_policy_safe(__policy, __n)		\
+	for_each_suitable_policy_safe(__policy, __n, true)
+#define for_each_inactive_policy_safe(__policy, __n)		\
+	for_each_suitable_policy_safe(__policy, __n, false)
+
 
 int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 				    struct cpufreq_frequency_table *table);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c0dd57e562a..4bbbca85134b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8856,23 +8856,20 @@  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	 *   * Thermal pressure will impact all cpus in this perf domain
 	 *     equally.
 	 */
-	if (sched_energy_enabled()) {
+	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
 		unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
-		struct perf_domain *pd = rcu_dereference(rq->rd->pd);
+		struct cpufreq_policy *policy, __maybe_unused *policy_n;
 
 		rq->cpu_capacity_inverted = 0;
 
-		SCHED_WARN_ON(!rcu_read_lock_held());
-
-		for (; pd; pd = pd->next) {
-			struct cpumask *pd_span = perf_domain_span(pd);
+		for_each_active_policy_safe(policy, policy_n) {
 			unsigned long pd_cap_orig, pd_cap;
 
 			/* We can't be inverted against our own pd */
-			if (cpumask_test_cpu(cpu_of(rq), pd_span))
+			if (cpumask_test_cpu(cpu_of(rq), policy->cpus))
 				continue;
 
-			cpu = cpumask_any(pd_span);
+			cpu = cpumask_any(policy->cpus);
 			pd_cap_orig = arch_scale_cpu_capacity(cpu);
 
 			if (capacity_orig < pd_cap_orig)