[v8,03/10] sched: move cfs task on a CPU with higher capacity

Message ID 1414745252-4895-4-git-send-email-vincent.guittot@linaro.org
State New
Headers show

Commit Message

Vincent Guittot Oct. 31, 2014, 8:47 a.m.
When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining
capacity for CFS tasks can be significantly reduced. Once we detect such
situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle
load balance to check if it's worth moving its tasks on an idle CPU.

Once the idle load_balance has selected the busiest CPU, it will look for an
active load balance for only two cases :
- there is only 1 task on the busiest CPU.
- we haven't been able to move a task of the busiest rq.

A CPU with a reduced capacity is included in the 1st case, and it's worth to
actively migrate its task if the idle CPU has got full capacity. This test has
been added in need_active_balance.

As a sidenote, this will note generate more spurious ilb because we already
trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that
has a task, we will trig the ilb once for migrating the task.

The nohz_kick_needed function has been cleaned up a bit while adding the new
test

env.src_cpu and env.src_rq must be set unconditionnally because they are used
in need_active_balance which is called even if busiest->nr_running equals 1

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 86 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 21 deletions(-)

Comments

Wanpeng Li Nov. 4, 2014, 8:30 a.m. | #1
On 14/10/31 下午4:47, Vincent Guittot wrote:
> When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining
> capacity for CFS tasks can be significantly reduced. Once we detect such
> situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle
> load balance to check if it's worth moving its tasks on an idle CPU.
>
> Once the idle load_balance has selected the busiest CPU, it will look for an
> active load balance for only two cases :
> - there is only 1 task on the busiest CPU.
> - we haven't been able to move a task of the busiest rq.
>
> A CPU with a reduced capacity is included in the 1st case, and it's worth to
> actively migrate its task if the idle CPU has got full capacity. This test has
> been added in need_active_balance.
>
> As a sidenote, this will note generate more spurious ilb because we already
> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that
> has a task, we will trig the ilb once for migrating the task.
>
> The nohz_kick_needed function has been cleaned up a bit while adding the new
> test
>
> env.src_cpu and env.src_rq must be set unconditionnally because they are used
> in need_active_balance which is called even if busiest->nr_running equals 1
>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>   kernel/sched/fair.c | 86 ++++++++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 65 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bd214d2..54468f3 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5889,6 +5889,18 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
>   }
>   
>   /*
> + * Check whether the capacity of the rq has been noticeably reduced by side
> + * activity. The imbalance_pct is used for the threshold.
> + * Return true is the capacity is reduced
> + */
> +static inline int
> +check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
> +{
> +	return ((rq->cpu_capacity * sd->imbalance_pct) <
> +				(rq->cpu_capacity_orig * 100));
> +}
> +
> +/*
>    * Group imbalance indicates (and tries to solve) the problem where balancing
>    * groups is inadequate due to tsk_cpus_allowed() constraints.
>    *
> @@ -6562,6 +6574,28 @@ static int need_active_balance(struct lb_env *env)
>   			return 1;
>   	}
>   
> +	/*
> +	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.

Why specify one task instead of not less than one?

Regards,
Wanpeng Li

> +	 * It's worth migrating the task if the src_cpu's capacity is reduced
> +	 * because of other sched_class or IRQs whereas capacity stays
> +	 * available on dst_cpu.
> +	 */
> +	if ((env->idle != CPU_NOT_IDLE) &&
> +			(env->src_rq->cfs.h_nr_running == 1)) {
> +		unsigned long src_eff_capacity, dst_eff_capacity;
> +
> +		dst_eff_capacity = 100;
> +		dst_eff_capacity *= capacity_of(env->dst_cpu);
> +		dst_eff_capacity *= capacity_orig_of(env->src_cpu);
> +
> +		src_eff_capacity = sd->imbalance_pct;
> +		src_eff_capacity *= capacity_of(env->src_cpu);
> +		src_eff_capacity *= capacity_orig_of(env->dst_cpu);
> +
> +		if (src_eff_capacity < dst_eff_capacity)
> +			return 1;
> +	}
> +
>   	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
>   }
>   
> @@ -6661,6 +6695,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>   
>   	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>   
> +	env.src_cpu = busiest->cpu;
> +	env.src_rq = busiest;
> +
>   	ld_moved = 0;
>   	if (busiest->nr_running > 1) {
>   		/*
> @@ -6670,8 +6707,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>   		 * correctly treated as an imbalance.
>   		 */
>   		env.flags |= LBF_ALL_PINNED;
> -		env.src_cpu   = busiest->cpu;
> -		env.src_rq    = busiest;
>   		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
>   
>   more_balance:
> @@ -7371,22 +7406,25 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   
>   /*
>    * Current heuristic for kicking the idle load balancer in the presence
> - * of an idle cpu is the system.
> + * of an idle cpu in the system.
>    *   - This rq has more than one task.
> - *   - At any scheduler domain level, this cpu's scheduler group has multiple
> - *     busy cpu's exceeding the group's capacity.
> + *   - This rq has at least one CFS task and the capacity of the CPU is
> + *     significantly reduced because of RT tasks or IRQs.
> + *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
> + *     multiple busy cpu.
>    *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>    *     domain span are idle.
>    */
> -static inline int nohz_kick_needed(struct rq *rq)
> +static inline bool nohz_kick_needed(struct rq *rq)
>   {
>   	unsigned long now = jiffies;
>   	struct sched_domain *sd;
>   	struct sched_group_capacity *sgc;
>   	int nr_busy, cpu = rq->cpu;
> +	bool kick = false;
>   
>   	if (unlikely(rq->idle_balance))
> -		return 0;
> +		return false;
>   
>          /*
>   	* We may be recently in ticked or tickless idle mode. At the first
> @@ -7400,38 +7438,44 @@ static inline int nohz_kick_needed(struct rq *rq)
>   	 * balancing.
>   	 */
>   	if (likely(!atomic_read(&nohz.nr_cpus)))
> -		return 0;
> +		return false;
>   
>   	if (time_before(now, nohz.next_balance))
> -		return 0;
> +		return false;
>   
>   	if (rq->nr_running >= 2)
> -		goto need_kick;
> +		return true;
>   
>   	rcu_read_lock();
>   	sd = rcu_dereference(per_cpu(sd_busy, cpu));
> -
>   	if (sd) {
>   		sgc = sd->groups->sgc;
>   		nr_busy = atomic_read(&sgc->nr_busy_cpus);
>   
> -		if (nr_busy > 1)
> -			goto need_kick_unlock;
> +		if (nr_busy > 1) {
> +			kick = true;
> +			goto unlock;
> +		}
> +
>   	}
>   
> -	sd = rcu_dereference(per_cpu(sd_asym, cpu));
> +	sd = rcu_dereference(rq->sd);
> +	if (sd) {
> +		if ((rq->cfs.h_nr_running >= 1) &&
> +				check_cpu_capacity(rq, sd)) {
> +			kick = true;
> +			goto unlock;
> +		}
> +	}
>   
> +	sd = rcu_dereference(per_cpu(sd_asym, cpu));
>   	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>   				  sched_domain_span(sd)) < cpu))
> -		goto need_kick_unlock;
> +		kick = true;
>   
> +unlock:
>   	rcu_read_unlock();
> -	return 0;
> -
> -need_kick_unlock:
> -	rcu_read_unlock();
> -need_kick:
> -	return 1;
> +	return kick;
>   }
>   #else
>   static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Vincent Guittot Nov. 4, 2014, 9:41 a.m. | #2
On 4 November 2014 09:30, Wanpeng Li <kernellwp@gmail.com> wrote:
>
> On 14/10/31 下午4:47, Vincent Guittot wrote:
>>
>> When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining
>> capacity for CFS tasks can be significantly reduced. Once we detect such
>> situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle
>> load balance to check if it's worth moving its tasks on an idle CPU.
>>
>> Once the idle load_balance has selected the busiest CPU, it will look for
>> an
>> active load balance for only two cases :
>> - there is only 1 task on the busiest CPU.
>> - we haven't been able to move a task of the busiest rq.
>>
>> A CPU with a reduced capacity is included in the 1st case, and it's worth
>> to
>> actively migrate its task if the idle CPU has got full capacity. This test
>> has
>> been added in need_active_balance.
>>
>> As a sidenote, this will note generate more spurious ilb because we
>> already
>> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one
>> that
>> has a task, we will trig the ilb once for migrating the task.
>>
>> The nohz_kick_needed function has been cleaned up a bit while adding the
>> new
>> test
>>
>> env.src_cpu and env.src_rq must be set unconditionnally because they are
>> used
>> in need_active_balance which is called even if busiest->nr_running equals
>> 1
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>   kernel/sched/fair.c | 86
>> ++++++++++++++++++++++++++++++++++++++++-------------
>>   1 file changed, 65 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index bd214d2..54468f3 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5889,6 +5889,18 @@ fix_small_capacity(struct sched_domain *sd, struct
>> sched_group *group)
>>   }
>>     /*
>> + * Check whether the capacity of the rq has been noticeably reduced by
>> side
>> + * activity. The imbalance_pct is used for the threshold.
>> + * Return true is the capacity is reduced
>> + */
>> +static inline int
>> +check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
>> +{
>> +       return ((rq->cpu_capacity * sd->imbalance_pct) <
>> +                               (rq->cpu_capacity_orig * 100));
>> +}
>> +
>> +/*
>>    * Group imbalance indicates (and tries to solve) the problem where
>> balancing
>>    * groups is inadequate due to tsk_cpus_allowed() constraints.
>>    *
>> @@ -6562,6 +6574,28 @@ static int need_active_balance(struct lb_env *env)
>>                         return 1;
>>         }
>>   +     /*
>> +        * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
>
>
> Why specify one task instead of not less than one?

if cfs.h_nr_running == 0 (which should not occurs at that point), we
don't need to do more test to check if it's worth moving the task
because there is no task to move.

Regards,
Vincent
>
> Regards,
> Wanpeng Li
>
>
>> +        * It's worth migrating the task if the src_cpu's capacity is
>> reduced
>> +        * because of other sched_class or IRQs whereas capacity stays
>> +        * available on dst_cpu.
>> +        */
>> +       if ((env->idle != CPU_NOT_IDLE) &&
>> +                       (env->src_rq->cfs.h_nr_running == 1)) {
>> +               unsigned long src_eff_capacity, dst_eff_capacity;
>> +
>> +               dst_eff_capacity = 100;
>> +               dst_eff_capacity *= capacity_of(env->dst_cpu);
>> +               dst_eff_capacity *= capacity_orig_of(env->src_cpu);
>> +
>> +               src_eff_capacity = sd->imbalance_pct;
>> +               src_eff_capacity *= capacity_of(env->src_cpu);
>> +               src_eff_capacity *= capacity_orig_of(env->dst_cpu);
>> +
>> +               if (src_eff_capacity < dst_eff_capacity)
>> +                       return 1;
>> +       }
>> +
>>         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
>>   }
>>   @@ -6661,6 +6695,9 @@ static int load_balance(int this_cpu, struct rq
>> *this_rq,
>>         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>>   +     env.src_cpu = busiest->cpu;
>> +       env.src_rq = busiest;
>> +
>>         ld_moved = 0;
>>         if (busiest->nr_running > 1) {
>>                 /*
>> @@ -6670,8 +6707,6 @@ static int load_balance(int this_cpu, struct rq
>> *this_rq,
>>                  * correctly treated as an imbalance.
>>                  */
>>                 env.flags |= LBF_ALL_PINNED;
>> -               env.src_cpu   = busiest->cpu;
>> -               env.src_rq    = busiest;
>>                 env.loop_max  = min(sysctl_sched_nr_migrate,
>> busiest->nr_running);
>>     more_balance:
>> @@ -7371,22 +7406,25 @@ static void nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>     /*
>>    * Current heuristic for kicking the idle load balancer in the presence
>> - * of an idle cpu is the system.
>> + * of an idle cpu in the system.
>>    *   - This rq has more than one task.
>> - *   - At any scheduler domain level, this cpu's scheduler group has
>> multiple
>> - *     busy cpu's exceeding the group's capacity.
>> + *   - This rq has at least one CFS task and the capacity of the CPU is
>> + *     significantly reduced because of RT tasks or IRQs.
>> + *   - At parent of LLC scheduler domain level, this cpu's scheduler
>> group has
>> + *     multiple busy cpu.
>>    *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>>    *     domain span are idle.
>>    */
>> -static inline int nohz_kick_needed(struct rq *rq)
>> +static inline bool nohz_kick_needed(struct rq *rq)
>>   {
>>         unsigned long now = jiffies;
>>         struct sched_domain *sd;
>>         struct sched_group_capacity *sgc;
>>         int nr_busy, cpu = rq->cpu;
>> +       bool kick = false;
>>         if (unlikely(rq->idle_balance))
>> -               return 0;
>> +               return false;
>>            /*
>>         * We may be recently in ticked or tickless idle mode. At the first
>> @@ -7400,38 +7438,44 @@ static inline int nohz_kick_needed(struct rq *rq)
>>          * balancing.
>>          */
>>         if (likely(!atomic_read(&nohz.nr_cpus)))
>> -               return 0;
>> +               return false;
>>         if (time_before(now, nohz.next_balance))
>> -               return 0;
>> +               return false;
>>         if (rq->nr_running >= 2)
>> -               goto need_kick;
>> +               return true;
>>         rcu_read_lock();
>>         sd = rcu_dereference(per_cpu(sd_busy, cpu));
>> -
>>         if (sd) {
>>                 sgc = sd->groups->sgc;
>>                 nr_busy = atomic_read(&sgc->nr_busy_cpus);
>>   -             if (nr_busy > 1)
>> -                       goto need_kick_unlock;
>> +               if (nr_busy > 1) {
>> +                       kick = true;
>> +                       goto unlock;
>> +               }
>> +
>>         }
>>   -     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>> +       sd = rcu_dereference(rq->sd);
>> +       if (sd) {
>> +               if ((rq->cfs.h_nr_running >= 1) &&
>> +                               check_cpu_capacity(rq, sd)) {
>> +                       kick = true;
>> +                       goto unlock;
>> +               }
>> +       }
>>   +     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>>         if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>>                                   sched_domain_span(sd)) < cpu))
>> -               goto need_kick_unlock;
>> +               kick = true;
>>   +unlock:
>>         rcu_read_unlock();
>> -       return 0;
>> -
>> -need_kick_unlock:
>> -       rcu_read_unlock();
>> -need_kick:
>> -       return 1;
>> +       return kick;
>>   }
>>   #else
>>   static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>> idle) { }
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Wanpeng Li Nov. 4, 2014, 10:42 a.m. | #3
Hi Vincent,
On 14/11/4 下午5:41, Vincent Guittot wrote:
> On 4 November 2014 09:30, Wanpeng Li <kernellwp@gmail.com> wrote:
>> On 14/10/31 下午4:47, Vincent Guittot wrote:
>>> When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining
>>> capacity for CFS tasks can be significantly reduced. Once we detect such
>>> situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle
>>> load balance to check if it's worth moving its tasks on an idle CPU.
>>>
>>> Once the idle load_balance has selected the busiest CPU, it will look for
>>> an
>>> active load balance for only two cases :
>>> - there is only 1 task on the busiest CPU.
>>> - we haven't been able to move a task of the busiest rq.
>>>
>>> A CPU with a reduced capacity is included in the 1st case, and it's worth
>>> to
>>> actively migrate its task if the idle CPU has got full capacity. This test
>>> has
>>> been added in need_active_balance.
>>>
>>> As a sidenote, this will note generate more spurious ilb because we
>>> already
>>> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one
>>> that
>>> has a task, we will trig the ilb once for migrating the task.
>>>
>>> The nohz_kick_needed function has been cleaned up a bit while adding the
>>> new
>>> test
>>>
>>> env.src_cpu and env.src_rq must be set unconditionnally because they are
>>> used
>>> in need_active_balance which is called even if busiest->nr_running equals
>>> 1
>>>
>>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>>> ---
>>>    kernel/sched/fair.c | 86
>>> ++++++++++++++++++++++++++++++++++++++++-------------
>>>    1 file changed, 65 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index bd214d2..54468f3 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -5889,6 +5889,18 @@ fix_small_capacity(struct sched_domain *sd, struct
>>> sched_group *group)
>>>    }
>>>      /*
>>> + * Check whether the capacity of the rq has been noticeably reduced by
>>> side
>>> + * activity. The imbalance_pct is used for the threshold.
>>> + * Return true is the capacity is reduced
>>> + */
>>> +static inline int
>>> +check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
>>> +{
>>> +       return ((rq->cpu_capacity * sd->imbalance_pct) <
>>> +                               (rq->cpu_capacity_orig * 100));
>>> +}
>>> +
>>> +/*
>>>     * Group imbalance indicates (and tries to solve) the problem where
>>> balancing
>>>     * groups is inadequate due to tsk_cpus_allowed() constraints.
>>>     *
>>> @@ -6562,6 +6574,28 @@ static int need_active_balance(struct lb_env *env)
>>>                          return 1;
>>>          }
>>>    +     /*
>>> +        * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
>>
>> Why specify one task instead of not less than one?
> if cfs.h_nr_running == 0 (which should not occurs at that point), we
> don't need to do more test to check if it's worth moving the task
> because there is no task to move.

Sorry for my confusing statement. I mean cfs.h_nr_running >= 1.

Regards,
Wanpeng Li

>
> Regards,
> Vincent
>> Regards,
>> Wanpeng Li
>>
>>
>>> +        * It's worth migrating the task if the src_cpu's capacity is
>>> reduced
>>> +        * because of other sched_class or IRQs whereas capacity stays
>>> +        * available on dst_cpu.
>>> +        */
>>> +       if ((env->idle != CPU_NOT_IDLE) &&
>>> +                       (env->src_rq->cfs.h_nr_running == 1)) {
>>> +               unsigned long src_eff_capacity, dst_eff_capacity;
>>> +
>>> +               dst_eff_capacity = 100;
>>> +               dst_eff_capacity *= capacity_of(env->dst_cpu);
>>> +               dst_eff_capacity *= capacity_orig_of(env->src_cpu);
>>> +
>>> +               src_eff_capacity = sd->imbalance_pct;
>>> +               src_eff_capacity *= capacity_of(env->src_cpu);
>>> +               src_eff_capacity *= capacity_orig_of(env->dst_cpu);
>>> +
>>> +               if (src_eff_capacity < dst_eff_capacity)
>>> +                       return 1;
>>> +       }
>>> +
>>>          return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
>>>    }
>>>    @@ -6661,6 +6695,9 @@ static int load_balance(int this_cpu, struct rq
>>> *this_rq,
>>>          schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>>>    +     env.src_cpu = busiest->cpu;
>>> +       env.src_rq = busiest;
>>> +
>>>          ld_moved = 0;
>>>          if (busiest->nr_running > 1) {
>>>                  /*
>>> @@ -6670,8 +6707,6 @@ static int load_balance(int this_cpu, struct rq
>>> *this_rq,
>>>                   * correctly treated as an imbalance.
>>>                   */
>>>                  env.flags |= LBF_ALL_PINNED;
>>> -               env.src_cpu   = busiest->cpu;
>>> -               env.src_rq    = busiest;
>>>                  env.loop_max  = min(sysctl_sched_nr_migrate,
>>> busiest->nr_running);
>>>      more_balance:
>>> @@ -7371,22 +7406,25 @@ static void nohz_idle_balance(struct rq *this_rq,
>>> enum cpu_idle_type idle)
>>>      /*
>>>     * Current heuristic for kicking the idle load balancer in the presence
>>> - * of an idle cpu is the system.
>>> + * of an idle cpu in the system.
>>>     *   - This rq has more than one task.
>>> - *   - At any scheduler domain level, this cpu's scheduler group has
>>> multiple
>>> - *     busy cpu's exceeding the group's capacity.
>>> + *   - This rq has at least one CFS task and the capacity of the CPU is
>>> + *     significantly reduced because of RT tasks or IRQs.
>>> + *   - At parent of LLC scheduler domain level, this cpu's scheduler
>>> group has
>>> + *     multiple busy cpu.
>>>     *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>>>     *     domain span are idle.
>>>     */
>>> -static inline int nohz_kick_needed(struct rq *rq)
>>> +static inline bool nohz_kick_needed(struct rq *rq)
>>>    {
>>>          unsigned long now = jiffies;
>>>          struct sched_domain *sd;
>>>          struct sched_group_capacity *sgc;
>>>          int nr_busy, cpu = rq->cpu;
>>> +       bool kick = false;
>>>          if (unlikely(rq->idle_balance))
>>> -               return 0;
>>> +               return false;
>>>             /*
>>>          * We may be recently in ticked or tickless idle mode. At the first
>>> @@ -7400,38 +7438,44 @@ static inline int nohz_kick_needed(struct rq *rq)
>>>           * balancing.
>>>           */
>>>          if (likely(!atomic_read(&nohz.nr_cpus)))
>>> -               return 0;
>>> +               return false;
>>>          if (time_before(now, nohz.next_balance))
>>> -               return 0;
>>> +               return false;
>>>          if (rq->nr_running >= 2)
>>> -               goto need_kick;
>>> +               return true;
>>>          rcu_read_lock();
>>>          sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>> -
>>>          if (sd) {
>>>                  sgc = sd->groups->sgc;
>>>                  nr_busy = atomic_read(&sgc->nr_busy_cpus);
>>>    -             if (nr_busy > 1)
>>> -                       goto need_kick_unlock;
>>> +               if (nr_busy > 1) {
>>> +                       kick = true;
>>> +                       goto unlock;
>>> +               }
>>> +
>>>          }
>>>    -     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>>> +       sd = rcu_dereference(rq->sd);
>>> +       if (sd) {
>>> +               if ((rq->cfs.h_nr_running >= 1) &&
>>> +                               check_cpu_capacity(rq, sd)) {
>>> +                       kick = true;
>>> +                       goto unlock;
>>> +               }
>>> +       }
>>>    +     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>>>          if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>>>                                    sched_domain_span(sd)) < cpu))
>>> -               goto need_kick_unlock;
>>> +               kick = true;
>>>    +unlock:
>>>          rcu_read_unlock();
>>> -       return 0;
>>> -
>>> -need_kick_unlock:
>>> -       rcu_read_unlock();
>>> -need_kick:
>>> -       return 1;
>>> +       return kick;
>>>    }
>>>    #else
>>>    static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>>> idle) { }
>>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Vincent Guittot Nov. 4, 2014, 11:10 a.m. | #4
On 4 November 2014 11:42, Wanpeng Li <kernellwp@gmail.com> wrote:
> Hi Vincent,
>
>>>> +
>>>> +/*
>>>>     * Group imbalance indicates (and tries to solve) the problem where
>>>> balancing
>>>>     * groups is inadequate due to tsk_cpus_allowed() constraints.
>>>>     *
>>>> @@ -6562,6 +6574,28 @@ static int need_active_balance(struct lb_env
>>>> *env)
>>>>                          return 1;
>>>>          }
>>>>    +     /*
>>>> +        * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
>>>
>>>
>>> Why specify one task instead of not less than one?
>>
>> if cfs.h_nr_running == 0 (which should not occurs at that point), we
>> don't need to do more test to check if it's worth moving the task
>> because there is no task to move.
>
>
> Sorry for my confusing statement. I mean cfs.h_nr_running >= 1.

ok.

If we have more than 1 task, we fall back into the default balancing
policy than should move a not running task or that will use the
imbalance field in case of the presence of pinned tasks

Regards,
Vincent

>
> Regards,
> Wanpeng Li
>
>
>>
>> Regards,
>> Vincent
>>>
>>> Regards,
>>> Wanpeng Li
>>>
>>>
>>>> +        * It's worth migrating the task if the src_cpu's capacity is
>>>> reduced
>>>> +        * because of other sched_class or IRQs whereas capacity stays
>>>> +        * available on dst_cpu.
>>>> +        */
>>>> +       if ((env->idle != CPU_NOT_IDLE) &&
>>>> +                       (env->src_rq->cfs.h_nr_running == 1)) {
>>>> +               unsigned long src_eff_capacity, dst_eff_capacity;
>>>> +
>>>> +               dst_eff_capacity = 100;
>>>> +               dst_eff_capacity *= capacity_of(env->dst_cpu);
>>>> +               dst_eff_capacity *= capacity_orig_of(env->src_cpu);
>>>> +
>>>> +               src_eff_capacity = sd->imbalance_pct;
>>>> +               src_eff_capacity *= capacity_of(env->src_cpu);
>>>> +               src_eff_capacity *= capacity_orig_of(env->dst_cpu);
>>>> +
>>>> +               if (src_eff_capacity < dst_eff_capacity)
>>>> +                       return 1;
>>>> +       }
>>>> +
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Wanpeng Li Nov. 18, 2014, 10:47 a.m. | #5
Hi Vincent,
On 10/31/14, 4:47 PM, Vincent Guittot wrote:
> When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining
> capacity for CFS tasks can be significantly reduced. Once we detect such

I see the cpu capacity will be reduced if RT tasks are running in 
scale_rt_capacity(), could you point out where cpu capacity reduced due 
to IRQs?

Regards,
Wanpeng Li

> situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle
> load balance to check if it's worth moving its tasks on an idle CPU.
>
> Once the idle load_balance has selected the busiest CPU, it will look for an
> active load balance for only two cases :
> - there is only 1 task on the busiest CPU.
> - we haven't been able to move a task of the busiest rq.
>
> A CPU with a reduced capacity is included in the 1st case, and it's worth to
> actively migrate its task if the idle CPU has got full capacity. This test has
> been added in need_active_balance.
>
> As a sidenote, this will note generate more spurious ilb because we already
> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that
> has a task, we will trig the ilb once for migrating the task.
>
> The nohz_kick_needed function has been cleaned up a bit while adding the new
> test
>
> env.src_cpu and env.src_rq must be set unconditionnally because they are used
> in need_active_balance which is called even if busiest->nr_running equals 1
>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>   kernel/sched/fair.c | 86 ++++++++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 65 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bd214d2..54468f3 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5889,6 +5889,18 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
>   }
>   
>   /*
> + * Check whether the capacity of the rq has been noticeably reduced by side
> + * activity. The imbalance_pct is used for the threshold.
> + * Return true is the capacity is reduced
> + */
> +static inline int
> +check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
> +{
> +	return ((rq->cpu_capacity * sd->imbalance_pct) <
> +				(rq->cpu_capacity_orig * 100));
> +}
> +
> +/*
>    * Group imbalance indicates (and tries to solve) the problem where balancing
>    * groups is inadequate due to tsk_cpus_allowed() constraints.
>    *
> @@ -6562,6 +6574,28 @@ static int need_active_balance(struct lb_env *env)
>   			return 1;
>   	}
>   
> +	/*
> +	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
> +	 * It's worth migrating the task if the src_cpu's capacity is reduced
> +	 * because of other sched_class or IRQs whereas capacity stays
> +	 * available on dst_cpu.
> +	 */
> +	if ((env->idle != CPU_NOT_IDLE) &&
> +			(env->src_rq->cfs.h_nr_running == 1)) {
> +		unsigned long src_eff_capacity, dst_eff_capacity;
> +
> +		dst_eff_capacity = 100;
> +		dst_eff_capacity *= capacity_of(env->dst_cpu);
> +		dst_eff_capacity *= capacity_orig_of(env->src_cpu);
> +
> +		src_eff_capacity = sd->imbalance_pct;
> +		src_eff_capacity *= capacity_of(env->src_cpu);
> +		src_eff_capacity *= capacity_orig_of(env->dst_cpu);
> +
> +		if (src_eff_capacity < dst_eff_capacity)
> +			return 1;
> +	}
> +
>   	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
>   }
>   
> @@ -6661,6 +6695,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>   
>   	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>   
> +	env.src_cpu = busiest->cpu;
> +	env.src_rq = busiest;
> +
>   	ld_moved = 0;
>   	if (busiest->nr_running > 1) {
>   		/*
> @@ -6670,8 +6707,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>   		 * correctly treated as an imbalance.
>   		 */
>   		env.flags |= LBF_ALL_PINNED;
> -		env.src_cpu   = busiest->cpu;
> -		env.src_rq    = busiest;
>   		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
>   
>   more_balance:
> @@ -7371,22 +7406,25 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   
>   /*
>    * Current heuristic for kicking the idle load balancer in the presence
> - * of an idle cpu is the system.
> + * of an idle cpu in the system.
>    *   - This rq has more than one task.
> - *   - At any scheduler domain level, this cpu's scheduler group has multiple
> - *     busy cpu's exceeding the group's capacity.
> + *   - This rq has at least one CFS task and the capacity of the CPU is
> + *     significantly reduced because of RT tasks or IRQs.
> + *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
> + *     multiple busy cpu.
>    *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>    *     domain span are idle.
>    */
> -static inline int nohz_kick_needed(struct rq *rq)
> +static inline bool nohz_kick_needed(struct rq *rq)
>   {
>   	unsigned long now = jiffies;
>   	struct sched_domain *sd;
>   	struct sched_group_capacity *sgc;
>   	int nr_busy, cpu = rq->cpu;
> +	bool kick = false;
>   
>   	if (unlikely(rq->idle_balance))
> -		return 0;
> +		return false;
>   
>          /*
>   	* We may be recently in ticked or tickless idle mode. At the first
> @@ -7400,38 +7438,44 @@ static inline int nohz_kick_needed(struct rq *rq)
>   	 * balancing.
>   	 */
>   	if (likely(!atomic_read(&nohz.nr_cpus)))
> -		return 0;
> +		return false;
>   
>   	if (time_before(now, nohz.next_balance))
> -		return 0;
> +		return false;
>   
>   	if (rq->nr_running >= 2)
> -		goto need_kick;
> +		return true;
>   
>   	rcu_read_lock();
>   	sd = rcu_dereference(per_cpu(sd_busy, cpu));
> -
>   	if (sd) {
>   		sgc = sd->groups->sgc;
>   		nr_busy = atomic_read(&sgc->nr_busy_cpus);
>   
> -		if (nr_busy > 1)
> -			goto need_kick_unlock;
> +		if (nr_busy > 1) {
> +			kick = true;
> +			goto unlock;
> +		}
> +
>   	}
>   
> -	sd = rcu_dereference(per_cpu(sd_asym, cpu));
> +	sd = rcu_dereference(rq->sd);
> +	if (sd) {
> +		if ((rq->cfs.h_nr_running >= 1) &&
> +				check_cpu_capacity(rq, sd)) {
> +			kick = true;
> +			goto unlock;
> +		}
> +	}
>   
> +	sd = rcu_dereference(per_cpu(sd_asym, cpu));
>   	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>   				  sched_domain_span(sd)) < cpu))
> -		goto need_kick_unlock;
> +		kick = true;
>   
> +unlock:
>   	rcu_read_unlock();
> -	return 0;
> -
> -need_kick_unlock:
> -	rcu_read_unlock();
> -need_kick:
> -	return 1;
> +	return kick;
>   }
>   #else
>   static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Vincent Guittot Nov. 18, 2014, 11 a.m. | #6
On 18 November 2014 11:47, Wanpeng Li <kernellwp@gmail.com> wrote:
> Hi Vincent,
> On 10/31/14, 4:47 PM, Vincent Guittot wrote:
>>
>> When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining
>> capacity for CFS tasks can be significantly reduced. Once we detect such
>
>
> I see the cpu capacity will be reduced if RT tasks are running in
> scale_rt_capacity(), could you point out where cpu capacity reduced due to
> IRQs?

with CONFIG_IRQ_TIME_ACCOUNTING, the time spent in interrupt context
will be added in rq->rt_avg

Regards,
Vincent

>
> Regards,
> Wanpeng Li
>
>
>> situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle
>> load balance to check if it's worth moving its tasks on an idle CPU.
>>
>> Once the idle load_balance has selected the busiest CPU, it will look for
>> an
>> active load balance for only two cases :
>> - there is only 1 task on the busiest CPU.
>> - we haven't been able to move a task of the busiest rq.
>>
>> A CPU with a reduced capacity is included in the 1st case, and it's worth
>> to
>> actively migrate its task if the idle CPU has got full capacity. This test
>> has
>> been added in need_active_balance.
>>
>> As a sidenote, this will note generate more spurious ilb because we
>> already
>> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one
>> that
>> has a task, we will trig the ilb once for migrating the task.
>>
>> The nohz_kick_needed function has been cleaned up a bit while adding the
>> new
>> test
>>
>> env.src_cpu and env.src_rq must be set unconditionnally because they are
>> used
>> in need_active_balance which is called even if busiest->nr_running equals
>> 1
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>   kernel/sched/fair.c | 86
>> ++++++++++++++++++++++++++++++++++++++++-------------
>>   1 file changed, 65 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index bd214d2..54468f3 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5889,6 +5889,18 @@ fix_small_capacity(struct sched_domain *sd, struct
>> sched_group *group)
>>   }
>>     /*
>> + * Check whether the capacity of the rq has been noticeably reduced by
>> side
>> + * activity. The imbalance_pct is used for the threshold.
>> + * Return true is the capacity is reduced
>> + */
>> +static inline int
>> +check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
>> +{
>> +       return ((rq->cpu_capacity * sd->imbalance_pct) <
>> +                               (rq->cpu_capacity_orig * 100));
>> +}
>> +
>> +/*
>>    * Group imbalance indicates (and tries to solve) the problem where
>> balancing
>>    * groups is inadequate due to tsk_cpus_allowed() constraints.
>>    *
>> @@ -6562,6 +6574,28 @@ static int need_active_balance(struct lb_env *env)
>>                         return 1;
>>         }
>>   +     /*
>> +        * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
>> +        * It's worth migrating the task if the src_cpu's capacity is
>> reduced
>> +        * because of other sched_class or IRQs whereas capacity stays
>> +        * available on dst_cpu.
>> +        */
>> +       if ((env->idle != CPU_NOT_IDLE) &&
>> +                       (env->src_rq->cfs.h_nr_running == 1)) {
>> +               unsigned long src_eff_capacity, dst_eff_capacity;
>> +
>> +               dst_eff_capacity = 100;
>> +               dst_eff_capacity *= capacity_of(env->dst_cpu);
>> +               dst_eff_capacity *= capacity_orig_of(env->src_cpu);
>> +
>> +               src_eff_capacity = sd->imbalance_pct;
>> +               src_eff_capacity *= capacity_of(env->src_cpu);
>> +               src_eff_capacity *= capacity_orig_of(env->dst_cpu);
>> +
>> +               if (src_eff_capacity < dst_eff_capacity)
>> +                       return 1;
>> +       }
>> +
>>         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
>>   }
>>   @@ -6661,6 +6695,9 @@ static int load_balance(int this_cpu, struct rq
>> *this_rq,
>>         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>>   +     env.src_cpu = busiest->cpu;
>> +       env.src_rq = busiest;
>> +
>>         ld_moved = 0;
>>         if (busiest->nr_running > 1) {
>>                 /*
>> @@ -6670,8 +6707,6 @@ static int load_balance(int this_cpu, struct rq
>> *this_rq,
>>                  * correctly treated as an imbalance.
>>                  */
>>                 env.flags |= LBF_ALL_PINNED;
>> -               env.src_cpu   = busiest->cpu;
>> -               env.src_rq    = busiest;
>>                 env.loop_max  = min(sysctl_sched_nr_migrate,
>> busiest->nr_running);
>>     more_balance:
>> @@ -7371,22 +7406,25 @@ static void nohz_idle_balance(struct rq *this_rq,
>> enum cpu_idle_type idle)
>>     /*
>>    * Current heuristic for kicking the idle load balancer in the presence
>> - * of an idle cpu is the system.
>> + * of an idle cpu in the system.
>>    *   - This rq has more than one task.
>> - *   - At any scheduler domain level, this cpu's scheduler group has
>> multiple
>> - *     busy cpu's exceeding the group's capacity.
>> + *   - This rq has at least one CFS task and the capacity of the CPU is
>> + *     significantly reduced because of RT tasks or IRQs.
>> + *   - At parent of LLC scheduler domain level, this cpu's scheduler
>> group has
>> + *     multiple busy cpu.
>>    *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>>    *     domain span are idle.
>>    */
>> -static inline int nohz_kick_needed(struct rq *rq)
>> +static inline bool nohz_kick_needed(struct rq *rq)
>>   {
>>         unsigned long now = jiffies;
>>         struct sched_domain *sd;
>>         struct sched_group_capacity *sgc;
>>         int nr_busy, cpu = rq->cpu;
>> +       bool kick = false;
>>         if (unlikely(rq->idle_balance))
>> -               return 0;
>> +               return false;
>>            /*
>>         * We may be recently in ticked or tickless idle mode. At the first
>> @@ -7400,38 +7438,44 @@ static inline int nohz_kick_needed(struct rq *rq)
>>          * balancing.
>>          */
>>         if (likely(!atomic_read(&nohz.nr_cpus)))
>> -               return 0;
>> +               return false;
>>         if (time_before(now, nohz.next_balance))
>> -               return 0;
>> +               return false;
>>         if (rq->nr_running >= 2)
>> -               goto need_kick;
>> +               return true;
>>         rcu_read_lock();
>>         sd = rcu_dereference(per_cpu(sd_busy, cpu));
>> -
>>         if (sd) {
>>                 sgc = sd->groups->sgc;
>>                 nr_busy = atomic_read(&sgc->nr_busy_cpus);
>>   -             if (nr_busy > 1)
>> -                       goto need_kick_unlock;
>> +               if (nr_busy > 1) {
>> +                       kick = true;
>> +                       goto unlock;
>> +               }
>> +
>>         }
>>   -     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>> +       sd = rcu_dereference(rq->sd);
>> +       if (sd) {
>> +               if ((rq->cfs.h_nr_running >= 1) &&
>> +                               check_cpu_capacity(rq, sd)) {
>> +                       kick = true;
>> +                       goto unlock;
>> +               }
>> +       }
>>   +     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>>         if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>>                                   sched_domain_span(sd)) < cpu))
>> -               goto need_kick_unlock;
>> +               kick = true;
>>   +unlock:
>>         rcu_read_unlock();
>> -       return 0;
>> -
>> -need_kick_unlock:
>> -       rcu_read_unlock();
>> -need_kick:
>> -       return 1;
>> +       return kick;
>>   }
>>   #else
>>   static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type
>> idle) { }
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bd214d2..54468f3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5889,6 +5889,18 @@  fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 }
 
 /*
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true is the capacity is reduced
+ */
+static inline int
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
+{
+	return ((rq->cpu_capacity * sd->imbalance_pct) <
+				(rq->cpu_capacity_orig * 100));
+}
+
+/*
  * Group imbalance indicates (and tries to solve) the problem where balancing
  * groups is inadequate due to tsk_cpus_allowed() constraints.
  *
@@ -6562,6 +6574,28 @@  static int need_active_balance(struct lb_env *env)
 			return 1;
 	}
 
+	/*
+	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+	 * It's worth migrating the task if the src_cpu's capacity is reduced
+	 * because of other sched_class or IRQs whereas capacity stays
+	 * available on dst_cpu.
+	 */
+	if ((env->idle != CPU_NOT_IDLE) &&
+			(env->src_rq->cfs.h_nr_running == 1)) {
+		unsigned long src_eff_capacity, dst_eff_capacity;
+
+		dst_eff_capacity = 100;
+		dst_eff_capacity *= capacity_of(env->dst_cpu);
+		dst_eff_capacity *= capacity_orig_of(env->src_cpu);
+
+		src_eff_capacity = sd->imbalance_pct;
+		src_eff_capacity *= capacity_of(env->src_cpu);
+		src_eff_capacity *= capacity_orig_of(env->dst_cpu);
+
+		if (src_eff_capacity < dst_eff_capacity)
+			return 1;
+	}
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -6661,6 +6695,9 @@  static int load_balance(int this_cpu, struct rq *this_rq,
 
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
+	env.src_cpu = busiest->cpu;
+	env.src_rq = busiest;
+
 	ld_moved = 0;
 	if (busiest->nr_running > 1) {
 		/*
@@ -6670,8 +6707,6 @@  static int load_balance(int this_cpu, struct rq *this_rq,
 		 * correctly treated as an imbalance.
 		 */
 		env.flags |= LBF_ALL_PINNED;
-		env.src_cpu   = busiest->cpu;
-		env.src_rq    = busiest;
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
@@ -7371,22 +7406,25 @@  static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 
 /*
  * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
  *   - This rq has more than one task.
- *   - At any scheduler domain level, this cpu's scheduler group has multiple
- *     busy cpu's exceeding the group's capacity.
+ *   - This rq has at least one CFS task and the capacity of the CPU is
+ *     significantly reduced because of RT tasks or IRQs.
+ *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ *     multiple busy cpu.
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
 {
 	unsigned long now = jiffies;
 	struct sched_domain *sd;
 	struct sched_group_capacity *sgc;
 	int nr_busy, cpu = rq->cpu;
+	bool kick = false;
 
 	if (unlikely(rq->idle_balance))
-		return 0;
+		return false;
 
        /*
 	* We may be recently in ticked or tickless idle mode. At the first
@@ -7400,38 +7438,44 @@  static inline int nohz_kick_needed(struct rq *rq)
 	 * balancing.
 	 */
 	if (likely(!atomic_read(&nohz.nr_cpus)))
-		return 0;
+		return false;
 
 	if (time_before(now, nohz.next_balance))
-		return 0;
+		return false;
 
 	if (rq->nr_running >= 2)
-		goto need_kick;
+		return true;
 
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
 	if (sd) {
 		sgc = sd->groups->sgc;
 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
 
-		if (nr_busy > 1)
-			goto need_kick_unlock;
+		if (nr_busy > 1) {
+			kick = true;
+			goto unlock;
+		}
+
 	}
 
-	sd = rcu_dereference(per_cpu(sd_asym, cpu));
+	sd = rcu_dereference(rq->sd);
+	if (sd) {
+		if ((rq->cfs.h_nr_running >= 1) &&
+				check_cpu_capacity(rq, sd)) {
+			kick = true;
+			goto unlock;
+		}
+	}
 
+	sd = rcu_dereference(per_cpu(sd_asym, cpu));
 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
 				  sched_domain_span(sd)) < cpu))
-		goto need_kick_unlock;
+		kick = true;
 
+unlock:
 	rcu_read_unlock();
-	return 0;
-
-need_kick_unlock:
-	rcu_read_unlock();
-need_kick:
-	return 1;
+	return kick;
 }
 #else
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }