diff mbox

[v5,08/12] sched: move cfs task on a CPU with higher capacity

Message ID 1409051215-16788-9-git-send-email-vincent.guittot@linaro.org
State New
Headers show

Commit Message

Vincent Guittot Aug. 26, 2014, 11:06 a.m. UTC
If the CPU is used for handling a lot of IRQs, trigger a load balance to check
if it's worth moving its tasks to another CPU that has more capacity.

As a side note, this will not generate more spurious ILB kicks because we
already trigger an ILB if there is more than one busy CPU. If this CPU is the
only one that has a task, we will trigger the ILB once to migrate the task.

The nohz_kick_needed() function has been cleaned up a bit while adding the new
test.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 69 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 20 deletions(-)

Comments

Preeti U Murthy Aug. 30, 2014, 5:50 p.m. UTC | #1
Hi Vincent,

On 08/26/2014 04:36 PM, Vincent Guittot wrote:
> If the CPU is used for handling lot of IRQs, trig a load balance to check if
> it's worth moving its tasks on another CPU that has more capacity.
> 
> As a sidenote, this will note generate more spurious ilb because we already
> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that
> has a task, we will trig the ilb once for migrating the task.
> 
> The nohz_kick_needed function has been cleaned up a bit while adding the new
> test
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c | 69 +++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 49 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 18db43e..60ae1ce 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  			return true;
>  	}
> 
> +	/*
> +	 * The group capacity is reduced probably because of activity from other
> +	 * sched class or interrupts which use part of the available capacity
> +	 */
> +	if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
> +				env->sd->imbalance_pct))

Wouldn't the check on avg_load let us know if we are packing more tasks
in this group than its capacity? Isn't that the metric we are more
interested in?

> +		return true;
> +
>  	return false;
>  }
> 
> @@ -6534,13 +6542,23 @@ static int need_active_balance(struct lb_env *env)
>  	struct sched_domain *sd = env->sd;
> 
>  	if (env->idle == CPU_NEWLY_IDLE) {
> +		int src_cpu = env->src_cpu;
> 
>  		/*
>  		 * ASYM_PACKING needs to force migrate tasks from busy but
>  		 * higher numbered CPUs in order to pack all tasks in the
>  		 * lowest numbered CPUs.
>  		 */
> -		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
> +		if ((sd->flags & SD_ASYM_PACKING) && src_cpu > env->dst_cpu)
> +			return 1;
> +
> +		/*
> +		 * If the CPUs share their cache and the src_cpu's capacity is
> +		 * reduced because of other sched_class or IRQs, we trig an
> +		 * active balance to move the task
> +		 */
> +		if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
> +				sd->imbalance_pct))
>  			return 1;
>  	}
> 
> @@ -6643,6 +6661,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
> 
>  	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
> 
> +	env.src_cpu = busiest->cpu;
> +
>  	ld_moved = 0;
>  	if (busiest->nr_running > 1) {
>  		/*
> @@ -6652,7 +6672,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>  		 * correctly treated as an imbalance.
>  		 */
>  		env.flags |= LBF_ALL_PINNED;
> -		env.src_cpu   = busiest->cpu;
>  		env.src_rq    = busiest;
>  		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
> 
> @@ -7359,10 +7378,12 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> 
>  /*
>   * Current heuristic for kicking the idle load balancer in the presence
> - * of an idle cpu is the system.
> + * of an idle cpu in the system.
>   *   - This rq has more than one task.
> - *   - At any scheduler domain level, this cpu's scheduler group has multiple
> - *     busy cpu's exceeding the group's capacity.
> + *   - This rq has at least one CFS task and the capacity of the CPU is
> + *     significantly reduced because of RT tasks or IRQs.
> + *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
> + *     multiple busy cpu.
>   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>   *     domain span are idle.
>   */
> @@ -7372,9 +7393,10 @@ static inline int nohz_kick_needed(struct rq *rq)
>  	struct sched_domain *sd;
>  	struct sched_group_capacity *sgc;
>  	int nr_busy, cpu = rq->cpu;
> +	bool kick = false;
> 
>  	if (unlikely(rq->idle_balance))
> -		return 0;
> +		return false;
> 
>         /*
>  	* We may be recently in ticked or tickless idle mode. At the first
> @@ -7388,38 +7410,45 @@ static inline int nohz_kick_needed(struct rq *rq)
>  	 * balancing.
>  	 */
>  	if (likely(!atomic_read(&nohz.nr_cpus)))
> -		return 0;
> +		return false;
> 
>  	if (time_before(now, nohz.next_balance))
> -		return 0;
> +		return false;
> 
>  	if (rq->nr_running >= 2)

Will this check ^^ not catch those cases which this patch is targeting?

Regards
Preeti U Murthy

> -		goto need_kick;
> +		return true;
> 
>  	rcu_read_lock();
>  	sd = rcu_dereference(per_cpu(sd_busy, cpu));
> -
>  	if (sd) {
>  		sgc = sd->groups->sgc;
>  		nr_busy = atomic_read(&sgc->nr_busy_cpus);
> 
> -		if (nr_busy > 1)
> -			goto need_kick_unlock;
> +		if (nr_busy > 1) {
> +			kick = true;
> +			goto unlock;
> +		}
> +
>  	}
> 
> -	sd = rcu_dereference(per_cpu(sd_asym, cpu));
> +	sd = rcu_dereference(rq->sd);
> +	if (sd) {
> +		if ((rq->cfs.h_nr_running >= 1) &&
> +				((rq->cpu_capacity * sd->imbalance_pct) <
> +				(rq->cpu_capacity_orig * 100))) {
> +			kick = true;
> +			goto unlock;
> +		}
> +	}
> 
> +	sd = rcu_dereference(per_cpu(sd_asym, cpu));
>  	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>  				  sched_domain_span(sd)) < cpu))
> -		goto need_kick_unlock;
> +		kick = true;
> 
> +unlock:
>  	rcu_read_unlock();
> -	return 0;
> -
> -need_kick_unlock:
> -	rcu_read_unlock();
> -need_kick:
> -	return 1;
> +	return kick;
>  }
>  #else
>  static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
> 

Vincent Guittot Sept. 1, 2014, 8:45 a.m. UTC | #2
On 30 August 2014 19:50, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
> Hi Vincent,
>
> On 08/26/2014 04:36 PM, Vincent Guittot wrote:
>> If the CPU is used for handling lot of IRQs, trig a load balance to check if
>> it's worth moving its tasks on another CPU that has more capacity.
>>
>> As a sidenote, this will note generate more spurious ilb because we already
>> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that
>> has a task, we will trig the ilb once for migrating the task.
>>
>> The nohz_kick_needed function has been cleaned up a bit while adding the new
>> test
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>  kernel/sched/fair.c | 69 +++++++++++++++++++++++++++++++++++++----------------
>>  1 file changed, 49 insertions(+), 20 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 18db43e..60ae1ce 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>                       return true;
>>       }
>>
>> +     /*
>> +      * The group capacity is reduced probably because of activity from other
>> +      * sched class or interrupts which use part of the available capacity
>> +      */
>> +     if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
>> +                             env->sd->imbalance_pct))
>
> Wouldn't the check on avg_load let us know if we are packing more tasks
> in this group than its capacity ? Isn't that the metric we are more
> interested in?

With this patch, we don't want to pack; rather, we prefer to spread the
task onto another CPU than the one handling the interrupts, if the
latter uses a significant part of the CPU capacity.
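
To make the comparison concrete, here is a minimal standalone sketch of
that check, with the sg->sgc/sgs fields flattened into a plain struct and
purely illustrative numbers (this is not the kernel code itself):

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for sg->sgc->capacity_orig and sgs->group_capacity. */
struct group_caps {
	unsigned long capacity_orig;	/* full capacity of the group */
	unsigned long capacity;		/* capacity left for CFS tasks */
};

/*
 * Mirrors the patch's test: the group is picked as busiest when the capacity
 * left for CFS has dropped below capacity_orig / imbalance_pct (imbalance_pct
 * is a percentage, typically in the 110..125 range at these domain levels).
 */
static bool group_capacity_reduced(const struct group_caps *g,
				   unsigned int imbalance_pct)
{
	return g->capacity_orig * 100 > g->capacity * imbalance_pct;
}

int main(void)
{
	/* Example: IRQ handling eats roughly 30% of the group's capacity. */
	struct group_caps g = { .capacity_orig = 1024, .capacity = 717 };

	/* 1024 * 100 = 102400 > 717 * 125 = 89625, so the check fires. */
	printf("reduced: %d\n", group_capacity_reduced(&g, 125));
	return 0;
}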

>
>> +             return true;
>> +
>>       return false;
>>  }
>>
>> @@ -6534,13 +6542,23 @@ static int need_active_balance(struct lb_env *env)
>>       struct sched_domain *sd = env->sd;
>>
>>       if (env->idle == CPU_NEWLY_IDLE) {
>> +             int src_cpu = env->src_cpu;
>>
>>               /*
>>                * ASYM_PACKING needs to force migrate tasks from busy but
>>                * higher numbered CPUs in order to pack all tasks in the
>>                * lowest numbered CPUs.
>>                */
>> -             if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
>> +             if ((sd->flags & SD_ASYM_PACKING) && src_cpu > env->dst_cpu)
>> +                     return 1;
>> +
>> +             /*
>> +              * If the CPUs share their cache and the src_cpu's capacity is
>> +              * reduced because of other sched_class or IRQs, we trig an
>> +              * active balance to move the task
>> +              */
>> +             if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
>> +                             sd->imbalance_pct))
>>                       return 1;
>>       }
>>
>> @@ -6643,6 +6661,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>
>>       schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>>
>> +     env.src_cpu = busiest->cpu;
>> +
>>       ld_moved = 0;
>>       if (busiest->nr_running > 1) {
>>               /*
>> @@ -6652,7 +6672,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>                * correctly treated as an imbalance.
>>                */
>>               env.flags |= LBF_ALL_PINNED;
>> -             env.src_cpu   = busiest->cpu;
>>               env.src_rq    = busiest;
>>               env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
>>
>> @@ -7359,10 +7378,12 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>>  /*
>>   * Current heuristic for kicking the idle load balancer in the presence
>> - * of an idle cpu is the system.
>> + * of an idle cpu in the system.
>>   *   - This rq has more than one task.
>> - *   - At any scheduler domain level, this cpu's scheduler group has multiple
>> - *     busy cpu's exceeding the group's capacity.
>> + *   - This rq has at least one CFS task and the capacity of the CPU is
>> + *     significantly reduced because of RT tasks or IRQs.
>> + *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
>> + *     multiple busy cpu.
>>   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>>   *     domain span are idle.
>>   */
>> @@ -7372,9 +7393,10 @@ static inline int nohz_kick_needed(struct rq *rq)
>>       struct sched_domain *sd;
>>       struct sched_group_capacity *sgc;
>>       int nr_busy, cpu = rq->cpu;
>> +     bool kick = false;
>>
>>       if (unlikely(rq->idle_balance))
>> -             return 0;
>> +             return false;
>>
>>         /*
>>       * We may be recently in ticked or tickless idle mode. At the first
>> @@ -7388,38 +7410,45 @@ static inline int nohz_kick_needed(struct rq *rq)
>>        * balancing.
>>        */
>>       if (likely(!atomic_read(&nohz.nr_cpus)))
>> -             return 0;
>> +             return false;
>>
>>       if (time_before(now, nohz.next_balance))
>> -             return 0;
>> +             return false;
>>
>>       if (rq->nr_running >= 2)
>
> Will this check ^^ not catch those cases which this patch is targeting?

This patch is not about how many tasks are running, but about whether the
capacity of the CPU is reduced because of side activity like interrupts,
which are only visible in the capacity value (with the IRQ_TIME_ACCOUNTING
config) but not in nr_running.
Even if the capacity is reduced because of RT tasks, nothing ensures
that the RT task will be running when the tick fires.
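
As a worked example of that difference (the numbers are illustrative, not
taken from a real trace): a CPU running a single CFS task while IRQs consume
around 45% of its time still has nr_running == 1, so the rq->nr_running >= 2
heuristic never fires, while the capacity-based test does:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative per-CPU values. */
#define CAPACITY_ORIG	1024UL	/* rq->cpu_capacity_orig */
#define CAPACITY	 563UL	/* rq->cpu_capacity after ~45% IRQ time */
#define IMBALANCE_PCT	 125UL	/* sd->imbalance_pct */

int main(void)
{
	unsigned int nr_running = 1;	/* the IRQ activity is invisible here */

	bool nr_running_kick = nr_running >= 2;
	bool capacity_kick = CAPACITY * IMBALANCE_PCT < CAPACITY_ORIG * 100;

	printf("nr_running heuristic: %d\n", nr_running_kick);	/* 0: no kick */
	printf("capacity check:       %d\n", capacity_kick);	/* 1: kick ilb */
	return 0;
}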

Regards,
Vincent
>
> Regards
> Preeti U Murthy
>
>> -             goto need_kick;
>> +             return true;
>>
>>       rcu_read_lock();
>>       sd = rcu_dereference(per_cpu(sd_busy, cpu));
>> -
>>       if (sd) {
>>               sgc = sd->groups->sgc;
>>               nr_busy = atomic_read(&sgc->nr_busy_cpus);
>>
>> -             if (nr_busy > 1)
>> -                     goto need_kick_unlock;
>> +             if (nr_busy > 1) {
>> +                     kick = true;
>> +                     goto unlock;
>> +             }
>> +
>>       }
>>
>> -     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>> +     sd = rcu_dereference(rq->sd);
>> +     if (sd) {
>> +             if ((rq->cfs.h_nr_running >= 1) &&
>> +                             ((rq->cpu_capacity * sd->imbalance_pct) <
>> +                             (rq->cpu_capacity_orig * 100))) {
>> +                     kick = true;
>> +                     goto unlock;
>> +             }
>> +     }
>>
>> +     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>>       if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>>                                 sched_domain_span(sd)) < cpu))
>> -             goto need_kick_unlock;
>> +             kick = true;
>>
>> +unlock:
>>       rcu_read_unlock();
>> -     return 0;
>> -
>> -need_kick_unlock:
>> -     rcu_read_unlock();
>> -need_kick:
>> -     return 1;
>> +     return kick;
>>  }
>>  #else
>>  static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
>>
>
Preeti U Murthy Sept. 3, 2014, 9:11 a.m. UTC | #3
On 09/01/2014 02:15 PM, Vincent Guittot wrote:
> On 30 August 2014 19:50, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
>> Hi Vincent,
>>> index 18db43e..60ae1ce 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>>                       return true;
>>>       }
>>>
>>> +     /*
>>> +      * The group capacity is reduced probably because of activity from other
>>> +      * sched class or interrupts which use part of the available capacity
>>> +      */
>>> +     if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
>>> +                             env->sd->imbalance_pct))
>>
>> Wouldn't the check on avg_load let us know if we are packing more tasks
>> in this group than its capacity ? Isn't that the metric we are more
>> interested in?
> 
> With  this patch, we don't want to pack but we prefer to spread the
> task on another CPU than the one which handles the interruption if
> latter uses a significant part of the CPU capacity.
> 
>>
>>> +             return true;
>>> +
>>>       return false;
>>>  }
>>>
>>> @@ -6534,13 +6542,23 @@ static int need_active_balance(struct lb_env *env)
>>>       struct sched_domain *sd = env->sd;
>>>
>>>       if (env->idle == CPU_NEWLY_IDLE) {
>>> +             int src_cpu = env->src_cpu;
>>>
>>>               /*
>>>                * ASYM_PACKING needs to force migrate tasks from busy but
>>>                * higher numbered CPUs in order to pack all tasks in the
>>>                * lowest numbered CPUs.
>>>                */
>>> -             if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
>>> +             if ((sd->flags & SD_ASYM_PACKING) && src_cpu > env->dst_cpu)
>>> +                     return 1;
>>> +
>>> +             /*
>>> +              * If the CPUs share their cache and the src_cpu's capacity is
>>> +              * reduced because of other sched_class or IRQs, we trig an
>>> +              * active balance to move the task
>>> +              */
>>> +             if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
>>> +                             sd->imbalance_pct))
>>>                       return 1;
>>>       }
>>>
>>> @@ -6643,6 +6661,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>>
>>>       schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>>>
>>> +     env.src_cpu = busiest->cpu;
>>> +
>>>       ld_moved = 0;
>>>       if (busiest->nr_running > 1) {
>>>               /*
>>> @@ -6652,7 +6672,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>>                * correctly treated as an imbalance.
>>>                */
>>>               env.flags |= LBF_ALL_PINNED;
>>> -             env.src_cpu   = busiest->cpu;
>>>               env.src_rq    = busiest;
>>>               env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
>>>
>>> @@ -7359,10 +7378,12 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>
>>>  /*
>>>   * Current heuristic for kicking the idle load balancer in the presence
>>> - * of an idle cpu is the system.
>>> + * of an idle cpu in the system.
>>>   *   - This rq has more than one task.
>>> - *   - At any scheduler domain level, this cpu's scheduler group has multiple
>>> - *     busy cpu's exceeding the group's capacity.
>>> + *   - This rq has at least one CFS task and the capacity of the CPU is
>>> + *     significantly reduced because of RT tasks or IRQs.
>>> + *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
>>> + *     multiple busy cpu.
>>>   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>>>   *     domain span are idle.
>>>   */
>>> @@ -7372,9 +7393,10 @@ static inline int nohz_kick_needed(struct rq *rq)
>>>       struct sched_domain *sd;
>>>       struct sched_group_capacity *sgc;
>>>       int nr_busy, cpu = rq->cpu;
>>> +     bool kick = false;
>>>
>>>       if (unlikely(rq->idle_balance))
>>> -             return 0;
>>> +             return false;
>>>
>>>         /*
>>>       * We may be recently in ticked or tickless idle mode. At the first
>>> @@ -7388,38 +7410,45 @@ static inline int nohz_kick_needed(struct rq *rq)
>>>        * balancing.
>>>        */
>>>       if (likely(!atomic_read(&nohz.nr_cpus)))
>>> -             return 0;
>>> +             return false;
>>>
>>>       if (time_before(now, nohz.next_balance))
>>> -             return 0;
>>> +             return false;
>>>
>>>       if (rq->nr_running >= 2)
>>
>> Will this check ^^ not catch those cases which this patch is targeting?
> 
> This patch is not about how many tasks are running but if the capacity
> of the CPU is reduced because of side activity like interruptions
> which are only visible in the capacity value (with IRQ_TIME_ACCOUNTING
> config) but not in nr_running.
> Even if the capacity is reduced because of RT tasks, nothing ensures
> that the RT task will be running when the tick fires
> 
> Regards,
> Vincent
>>
>> Regards
>> Preeti U Murthy
>>
>>> -             goto need_kick;
>>> +             return true;
>>>
>>>       rcu_read_lock();
>>>       sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>> -
>>>       if (sd) {
>>>               sgc = sd->groups->sgc;
>>>               nr_busy = atomic_read(&sgc->nr_busy_cpus);
>>>
>>> -             if (nr_busy > 1)
>>> -                     goto need_kick_unlock;
>>> +             if (nr_busy > 1) {
>>> +                     kick = true;
>>> +                     goto unlock;
>>> +             }
>>> +
>>>       }
>>>
>>> -     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>>> +     sd = rcu_dereference(rq->sd);
>>> +     if (sd) {
>>> +             if ((rq->cfs.h_nr_running >= 1) &&
>>> +                             ((rq->cpu_capacity * sd->imbalance_pct) <
>>> +                             (rq->cpu_capacity_orig * 100))) {

OK, I understand your explanation above. But I was wondering whether you
would need to add this rq->cfs.h_nr_running >= 1 check in the above two
cases as well.

I have actually raised this concern, over whether we should be using
rq->nr_running or cfs_rq->nr_running while we do load balancing, in my
reply to your patch 3. While all our load measurements are about the cfs_rq
load, we use rq->nr_running, which may include tasks from other sched
classes as well. We divide by it to get the average load per task during
load balancing, which is wrong, isn't it? Similarly, during
nohz_kick_needed() we trigger load balancing based on rq->nr_running again.

In this patch too, even if you know that the CPU is dominated by
tasks that do not belong to the CFS class, you would be triggering a futile
load balance if there are no fair tasks to move.

Regards
Preeti U Murthy

Vincent Guittot Sept. 3, 2014, 11:44 a.m. UTC | #4
On 3 September 2014 11:11, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
> On 09/01/2014 02:15 PM, Vincent Guittot wrote:
>> On 30 August 2014 19:50, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
>>> Hi Vincent,
>>>> index 18db43e..60ae1ce 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>>>                       return true;
>>>>       }
>>>>
>>>> +     /*
>>>> +      * The group capacity is reduced probably because of activity from other
>>>> +      * sched class or interrupts which use part of the available capacity
>>>> +      */
>>>> +     if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
>>>> +                             env->sd->imbalance_pct))
>>>
>>> Wouldn't the check on avg_load let us know if we are packing more tasks
>>> in this group than its capacity ? Isn't that the metric we are more
>>> interested in?
>>
>> With  this patch, we don't want to pack but we prefer to spread the
>> task on another CPU than the one which handles the interruption if
>> latter uses a significant part of the CPU capacity.
>>
>>>
>>>> +             return true;
>>>> +
>>>>       return false;
>>>>  }
>>>>
>>>> @@ -6534,13 +6542,23 @@ static int need_active_balance(struct lb_env *env)
>>>>       struct sched_domain *sd = env->sd;
>>>>
>>>>       if (env->idle == CPU_NEWLY_IDLE) {
>>>> +             int src_cpu = env->src_cpu;
>>>>
>>>>               /*
>>>>                * ASYM_PACKING needs to force migrate tasks from busy but
>>>>                * higher numbered CPUs in order to pack all tasks in the
>>>>                * lowest numbered CPUs.
>>>>                */
>>>> -             if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
>>>> +             if ((sd->flags & SD_ASYM_PACKING) && src_cpu > env->dst_cpu)
>>>> +                     return 1;
>>>> +
>>>> +             /*
>>>> +              * If the CPUs share their cache and the src_cpu's capacity is
>>>> +              * reduced because of other sched_class or IRQs, we trig an
>>>> +              * active balance to move the task
>>>> +              */
>>>> +             if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
>>>> +                             sd->imbalance_pct))
>>>>                       return 1;
>>>>       }
>>>>
>>>> @@ -6643,6 +6661,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>>>
>>>>       schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>>>>
>>>> +     env.src_cpu = busiest->cpu;
>>>> +
>>>>       ld_moved = 0;
>>>>       if (busiest->nr_running > 1) {
>>>>               /*
>>>> @@ -6652,7 +6672,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>>>                * correctly treated as an imbalance.
>>>>                */
>>>>               env.flags |= LBF_ALL_PINNED;
>>>> -             env.src_cpu   = busiest->cpu;
>>>>               env.src_rq    = busiest;
>>>>               env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
>>>>
>>>> @@ -7359,10 +7378,12 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>>
>>>>  /*
>>>>   * Current heuristic for kicking the idle load balancer in the presence
>>>> - * of an idle cpu is the system.
>>>> + * of an idle cpu in the system.
>>>>   *   - This rq has more than one task.
>>>> - *   - At any scheduler domain level, this cpu's scheduler group has multiple
>>>> - *     busy cpu's exceeding the group's capacity.
>>>> + *   - This rq has at least one CFS task and the capacity of the CPU is
>>>> + *     significantly reduced because of RT tasks or IRQs.
>>>> + *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
>>>> + *     multiple busy cpu.
>>>>   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>>>>   *     domain span are idle.
>>>>   */
>>>> @@ -7372,9 +7393,10 @@ static inline int nohz_kick_needed(struct rq *rq)
>>>>       struct sched_domain *sd;
>>>>       struct sched_group_capacity *sgc;
>>>>       int nr_busy, cpu = rq->cpu;
>>>> +     bool kick = false;
>>>>
>>>>       if (unlikely(rq->idle_balance))
>>>> -             return 0;
>>>> +             return false;
>>>>
>>>>         /*
>>>>       * We may be recently in ticked or tickless idle mode. At the first
>>>> @@ -7388,38 +7410,45 @@ static inline int nohz_kick_needed(struct rq *rq)
>>>>        * balancing.
>>>>        */
>>>>       if (likely(!atomic_read(&nohz.nr_cpus)))
>>>> -             return 0;
>>>> +             return false;
>>>>
>>>>       if (time_before(now, nohz.next_balance))
>>>> -             return 0;
>>>> +             return false;
>>>>
>>>>       if (rq->nr_running >= 2)
>>>
>>> Will this check ^^ not catch those cases which this patch is targeting?
>>
>> This patch is not about how many tasks are running but if the capacity
>> of the CPU is reduced because of side activity like interruptions
>> which are only visible in the capacity value (with IRQ_TIME_ACCOUNTING
>> config) but not in nr_running.
>> Even if the capacity is reduced because of RT tasks, nothing ensures
>> that the RT task will be running when the tick fires
>>
>> Regards,
>> Vincent
>>>
>>> Regards
>>> Preeti U Murthy
>>>
>>>> -             goto need_kick;
>>>> +             return true;
>>>>
>>>>       rcu_read_lock();
>>>>       sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>>> -
>>>>       if (sd) {
>>>>               sgc = sd->groups->sgc;
>>>>               nr_busy = atomic_read(&sgc->nr_busy_cpus);
>>>>
>>>> -             if (nr_busy > 1)
>>>> -                     goto need_kick_unlock;
>>>> +             if (nr_busy > 1) {
>>>> +                     kick = true;
>>>> +                     goto unlock;
>>>> +             }
>>>> +
>>>>       }
>>>>
>>>> -     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>>>> +     sd = rcu_dereference(rq->sd);
>>>> +     if (sd) {
>>>> +             if ((rq->cfs.h_nr_running >= 1) &&
>>>> +                             ((rq->cpu_capacity * sd->imbalance_pct) <
>>>> +                             (rq->cpu_capacity_orig * 100))) {
>
> Ok I understand your explanation above. But I was wondering if you would
> need to add this check around rq->cfs.h_nr_running >= 1 in the above two
> cases as well.

Yes, you're right for the test if (rq->nr_running >= 2).

It's not so straightforward for nr_busy_cpus, which reflects how many
CPUs have not stopped their tick. The scheduler assumes that the
latter are busy with CFS tasks.

>
> I have actually raised this concern over whether we should be using
> rq->nr_running or cfs_rq->nr_running while we do load balancing in reply
> to your patch3. While all our load measurements are about the cfs_rq

I have just replied to your comments on patch 3. Sorry for the delay.

> load, we use rq->nr_running, which may include tasks from other sched
> classes as well. We divide them to get average load per task during load
> balancing which is wrong, isn't it? Similarly during nohz_kick_needed(),
> we trigger load balancing based on rq->nr_running again.
>
> In this patch too, even if you know that the cpu is being dominated by
> tasks that do not belong to cfs class, you would be triggering a futile
> load balance if there are no fair tasks to move.
This patch adds one additional condition that is based on
rq->cfs.h_nr_running, so it should not trigger any futile load balance.
I have also taken advantage of this patch to clean up
nohz_kick_needed() as proposed by Peter, but the conditions are the same
as before (except for the new one based on rq->cfs.h_nr_running).

I can prepare another patchset that addresses the concerns you
raised for nohz_kick_needed() and in patch 3, but I would prefer not to
include them in this patchset, which is large enough and whose subject
is a bit different.
Does that seem OK to you?

Regards,
Vincent
>
> Regards
> Preeti U Murthy
>
Preeti U Murthy Sept. 3, 2014, 12:26 p.m. UTC | #5
On 09/03/2014 05:14 PM, Vincent Guittot wrote:
> On 3 September 2014 11:11, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
>> On 09/01/2014 02:15 PM, Vincent Guittot wrote:
>>> On 30 August 2014 19:50, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
>>>> Hi Vincent,
>>>>> index 18db43e..60ae1ce 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>>>>                       return true;
>>>>>       }
>>>>>
>>>>> +     /*
>>>>> +      * The group capacity is reduced probably because of activity from other
>>>>> +      * sched class or interrupts which use part of the available capacity
>>>>> +      */
>>>>> +     if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
>>>>> +                             env->sd->imbalance_pct))
>>>>
>>>> Wouldn't the check on avg_load let us know if we are packing more tasks
>>>> in this group than its capacity ? Isn't that the metric we are more
>>>> interested in?
>>>
>>> With  this patch, we don't want to pack but we prefer to spread the
>>> task on another CPU than the one which handles the interruption if
>>> latter uses a significant part of the CPU capacity.
>>>
>>>>
>>>>> +             return true;
>>>>> +
>>>>>       return false;
>>>>>  }
>>>>>
>>>>> @@ -6534,13 +6542,23 @@ static int need_active_balance(struct lb_env *env)
>>>>>       struct sched_domain *sd = env->sd;
>>>>>
>>>>>       if (env->idle == CPU_NEWLY_IDLE) {
>>>>> +             int src_cpu = env->src_cpu;
>>>>>
>>>>>               /*
>>>>>                * ASYM_PACKING needs to force migrate tasks from busy but
>>>>>                * higher numbered CPUs in order to pack all tasks in the
>>>>>                * lowest numbered CPUs.
>>>>>                */
>>>>> -             if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
>>>>> +             if ((sd->flags & SD_ASYM_PACKING) && src_cpu > env->dst_cpu)
>>>>> +                     return 1;
>>>>> +
>>>>> +             /*
>>>>> +              * If the CPUs share their cache and the src_cpu's capacity is
>>>>> +              * reduced because of other sched_class or IRQs, we trig an
>>>>> +              * active balance to move the task
>>>>> +              */
>>>>> +             if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
>>>>> +                             sd->imbalance_pct))
>>>>>                       return 1;
>>>>>       }
>>>>>
>>>>> @@ -6643,6 +6661,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>>>>
>>>>>       schedstat_add(sd, lb_imbalance[idle], env.imbalance);
>>>>>
>>>>> +     env.src_cpu = busiest->cpu;
>>>>> +
>>>>>       ld_moved = 0;
>>>>>       if (busiest->nr_running > 1) {
>>>>>               /*
>>>>> @@ -6652,7 +6672,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>>>>                * correctly treated as an imbalance.
>>>>>                */
>>>>>               env.flags |= LBF_ALL_PINNED;
>>>>> -             env.src_cpu   = busiest->cpu;
>>>>>               env.src_rq    = busiest;
>>>>>               env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
>>>>>
>>>>> @@ -7359,10 +7378,12 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>>>>
>>>>>  /*
>>>>>   * Current heuristic for kicking the idle load balancer in the presence
>>>>> - * of an idle cpu is the system.
>>>>> + * of an idle cpu in the system.
>>>>>   *   - This rq has more than one task.
>>>>> - *   - At any scheduler domain level, this cpu's scheduler group has multiple
>>>>> - *     busy cpu's exceeding the group's capacity.
>>>>> + *   - This rq has at least one CFS task and the capacity of the CPU is
>>>>> + *     significantly reduced because of RT tasks or IRQs.
>>>>> + *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
>>>>> + *     multiple busy cpu.
>>>>>   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>>>>>   *     domain span are idle.
>>>>>   */
>>>>> @@ -7372,9 +7393,10 @@ static inline int nohz_kick_needed(struct rq *rq)
>>>>>       struct sched_domain *sd;
>>>>>       struct sched_group_capacity *sgc;
>>>>>       int nr_busy, cpu = rq->cpu;
>>>>> +     bool kick = false;
>>>>>
>>>>>       if (unlikely(rq->idle_balance))
>>>>> -             return 0;
>>>>> +             return false;
>>>>>
>>>>>         /*
>>>>>       * We may be recently in ticked or tickless idle mode. At the first
>>>>> @@ -7388,38 +7410,45 @@ static inline int nohz_kick_needed(struct rq *rq)
>>>>>        * balancing.
>>>>>        */
>>>>>       if (likely(!atomic_read(&nohz.nr_cpus)))
>>>>> -             return 0;
>>>>> +             return false;
>>>>>
>>>>>       if (time_before(now, nohz.next_balance))
>>>>> -             return 0;
>>>>> +             return false;
>>>>>
>>>>>       if (rq->nr_running >= 2)
>>>>
>>>> Will this check ^^ not catch those cases which this patch is targeting?
>>>
>>> This patch is not about how many tasks are running but if the capacity
>>> of the CPU is reduced because of side activity like interruptions
>>> which are only visible in the capacity value (with IRQ_TIME_ACCOUNTING
>>> config) but not in nr_running.
>>> Even if the capacity is reduced because of RT tasks, nothing ensures
>>> that the RT task will be running when the tick fires
>>>
>>> Regards,
>>> Vincent
>>>>
>>>> Regards
>>>> Preeti U Murthy
>>>>
>>>>> -             goto need_kick;
>>>>> +             return true;
>>>>>
>>>>>       rcu_read_lock();
>>>>>       sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>>>> -
>>>>>       if (sd) {
>>>>>               sgc = sd->groups->sgc;
>>>>>               nr_busy = atomic_read(&sgc->nr_busy_cpus);
>>>>>
>>>>> -             if (nr_busy > 1)
>>>>> -                     goto need_kick_unlock;
>>>>> +             if (nr_busy > 1) {
>>>>> +                     kick = true;
>>>>> +                     goto unlock;
>>>>> +             }
>>>>> +
>>>>>       }
>>>>>
>>>>> -     sd = rcu_dereference(per_cpu(sd_asym, cpu));
>>>>> +     sd = rcu_dereference(rq->sd);
>>>>> +     if (sd) {
>>>>> +             if ((rq->cfs.h_nr_running >= 1) &&
>>>>> +                             ((rq->cpu_capacity * sd->imbalance_pct) <
>>>>> +                             (rq->cpu_capacity_orig * 100))) {
>>
>> Ok I understand your explanation above. But I was wondering if you would
>> need to add this check around rq->cfs.h_nr_running >= 1 in the above two
>> cases as well.
> 
> yes you're right for the test if (rq->nr_running >= 2).
> 
> It's not so straight forward for nr_busy_cpus which reflects how many
> CPUs have not stopped their tick. The scheduler assumes that the
> latter are busy with cfs tasks
> 
>>
>> I have actually raised this concern over whether we should be using
>> rq->nr_running or cfs_rq->nr_running while we do load balancing in reply
>> to your patch3. While all our load measurements are about the cfs_rq
> 
> I have just replied to your comments on patch 3. Sorry for the delay
> 
>> load, we use rq->nr_running, which may include tasks from other sched
>> classes as well. We divide them to get average load per task during load
>> balancing which is wrong, isn't it? Similarly during nohz_kick_needed(),
>> we trigger load balancing based on rq->nr_running again.
>>
>> In this patch too, even if you know that the cpu is being dominated by
>> tasks that do not belong to cfs class, you would be triggering a futile
>> load balance if there are no fair tasks to move.
> This patch adds one additional condition that is based on
> rq->cfs.h_nr_running so it should not trigger any futile load balance.
> Then, I have also take advantage of this patch to clean up
> nohz_kick_needed as proposed by Peter but the conditions are the same
> than previously (except the one with rq->cfs.h_nr_running)
> 
> I can prepare another patchset that will solve the concerns that you
> raised for nohz_kick_needed and in patch 3 but i would prefer not
> include them in this patchset which is large enough and which subject
> is a bit different.
> Does it seem ok for you ?

Sure Vincent, thanks! I have in fact sent out a mail raising my concern
over rq->nr_running. If others agree that the issue exists, maybe we can
work on that next patchset so that it cleans this up in all the places
necessary, and not just in nohz_kick_needed().

Regards
Preeti U Murthy
> 
> Regards,
> Vincent
>>
>> Regards
>> Preeti U Murthy
>>
> 

Vincent Guittot Sept. 3, 2014, 12:49 p.m. UTC | #6
On 3 September 2014 14:26, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
> On 09/03/2014 05:14 PM, Vincent Guittot wrote:
>> On 3 September 2014 11:11, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
>>> On 09/01/2014 02:15 PM, Vincent Guittot wrote:

[snip]

>>>
>>> Ok I understand your explanation above. But I was wondering if you would
>>> need to add this check around rq->cfs.h_nr_running >= 1 in the above two
>>> cases as well.
>>
>> yes you're right for the test if (rq->nr_running >= 2).
>>
>> It's not so straight forward for nr_busy_cpus which reflects how many
>> CPUs have not stopped their tick. The scheduler assumes that the
>> latter are busy with cfs tasks
>>
>>>
>>> I have actually raised this concern over whether we should be using
>>> rq->nr_running or cfs_rq->nr_running while we do load balancing in reply
>>> to your patch3. While all our load measurements are about the cfs_rq
>>
>> I have just replied to your comments on patch 3. Sorry for the delay
>>
>>> load, we use rq->nr_running, which may include tasks from other sched
>>> classes as well. We divide them to get average load per task during load
>>> balancing which is wrong, isn't it? Similarly during nohz_kick_needed(),
>>> we trigger load balancing based on rq->nr_running again.
>>>
>>> In this patch too, even if you know that the cpu is being dominated by
>>> tasks that do not belong to cfs class, you would be triggering a futile
>>> load balance if there are no fair tasks to move.
>> This patch adds one additional condition that is based on
>> rq->cfs.h_nr_running so it should not trigger any futile load balance.
>> Then, I have also take advantage of this patch to clean up
>> nohz_kick_needed as proposed by Peter but the conditions are the same
>> than previously (except the one with rq->cfs.h_nr_running)
>>
>> I can prepare another patchset that will solve the concerns that you
>> raised for nohz_kick_needed and in patch 3 but i would prefer not
>> include them in this patchset which is large enough and which subject
>> is a bit different.
>> Does it seem ok for you ?
>
> Sure Vincent, thanks! I have in fact sent out a mail raising my concern
> over rq->nr_running. If others agree on the issue to be existing, maybe
> we can work on this next patchset that can clean this up in all places
> necessary and not just in nohz_kick_needed().

OK, let's continue this discussion on the other thread.

Regards,
Vincent

>
> Regards
> Preeti U Murthy
>>
>> Regards,
>> Vincent
>>>
>>> Regards
>>> Preeti U Murthy
>>>
>>
>
Preeti U Murthy Sept. 5, 2014, 12:06 p.m. UTC | #7
Hi Vincent,

On 08/26/2014 04:36 PM, Vincent Guittot wrote:
> If the CPU is used for handling lot of IRQs, trig a load balance to check if
> it's worth moving its tasks on another CPU that has more capacity.
> 
> As a sidenote, this will note generate more spurious ilb because we already
> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that
> has a task, we will trig the ilb once for migrating the task.
> 
> The nohz_kick_needed function has been cleaned up a bit while adding the new
> test
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

So I see that your previous patches add checks on whether the CPU capacity
left for CFS tasks is good enough to run tasks on the CPU. My concern is
that, although they appear sensible, they could increase the number of
times we load balance to a large extent.

Ebizzy would not test this aspect, right? There are no real-time
tasks/interrupts that get generated.

Besides, what is the column that says patchset+irq? What is the irq
accounting patchset that you refer to in your cover letter?

Regards
Preeti U Murthy

Vincent Guittot Sept. 5, 2014, 12:24 p.m. UTC | #8
On 5 September 2014 14:06, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
> Hi Vincent,
>
> On 08/26/2014 04:36 PM, Vincent Guittot wrote:
>> If the CPU is used for handling lot of IRQs, trig a load balance to check if
>> it's worth moving its tasks on another CPU that has more capacity.
>>
>> As a sidenote, this will note generate more spurious ilb because we already
>> trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that
>> has a task, we will trig the ilb once for migrating the task.
>>
>> The nohz_kick_needed function has been cleaned up a bit while adding the new
>> test
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>
> So I see that there are added checks in your previous patches on if the
> cpu capacity for CFS tasks is good enough to run tasks on the cpu. My
> concern is although they appear sensible, would they trigger an increase
> in the number of times we load balance to a large extent.
>
> Ebizzy would not test this aspect right? There are no real time
> tasks/interrupts that get generated.

Yes, ebizzy doesn't test this part, but it checks for non-regression.

The scp test is the one that I use to check this patch and the
previous one, but a test with a mix of CFS and RT tasks should also do
the job.
We can see an increase of 82% for the dual core when
CONFIG_IRQ_TIME_ACCOUNTING is enabled.

>
> Besides, what is the column that says patchset+irq? What is the irq
> accounting patchset that you refer to in your cover letter?

It refers to CONFIG_IRQ_TIME_ACCOUNTING, which includes the time spent
in interrupt context when computing scale_rt_capacity().
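
As a rough sketch of what that means for the capacity value (the real
scale_rt_capacity() works on decayed per-rq averages; this just uses raw
fractions for illustration):

#include <stdio.h>

/*
 * Back-of-the-envelope scaling: the capacity left for CFS is capacity_orig
 * scaled by the fraction of time not spent in RT tasks or, with
 * CONFIG_IRQ_TIME_ACCOUNTING, in interrupt context.
 */
static unsigned long scaled_capacity(unsigned long capacity_orig,
				     unsigned long rt_irq_time,
				     unsigned long total_time)
{
	return capacity_orig * (total_time - rt_irq_time) / total_time;
}

int main(void)
{
	/* 450us of IRQ time in a 1000us window on a capacity-1024 CPU. */
	printf("%lu\n", scaled_capacity(1024, 450, 1000));	/* prints 563 */
	return 0;
}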

Regards,
Vincent

>
> Regards
> Preeti U Murthy
>
Peter Zijlstra Sept. 11, 2014, 9:27 a.m. UTC | #9
On Wed, Sep 03, 2014 at 05:56:04PM +0530, Preeti U Murthy wrote:
> On 09/03/2014 05:14 PM, Vincent Guittot wrote:
> > On 3 September 2014 11:11, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
> >> On 09/01/2014 02:15 PM, Vincent Guittot wrote:
> >>> On 30 August 2014 19:50, Preeti U Murthy <preeti@linux.vnet.ibm.com> wrote:
> >>>> Hi Vincent,
> >>>>> index 18db43e..60ae1ce 100644

Guys, really, trim excess crap. This email had over 150 lines of garbage
and 5 pointless quote levels or so before there was anything useful.
Peter Zijlstra Sept. 11, 2014, 10:07 a.m. UTC | #10
On Tue, Aug 26, 2014 at 01:06:51PM +0200, Vincent Guittot wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 18db43e..60ae1ce 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  			return true;
>  	}
>  
> +	/*
> +	 * The group capacity is reduced probably because of activity from other
> +	 * sched class or interrupts which use part of the available capacity
> +	 */
> +	if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
> +				env->sd->imbalance_pct))
> +		return true;
> +
>  	return false;
>  }

This is unlikely to have worked as intended. You will never reach this,
except on PowerPC >= 7. All other machines will have bailed at the
!ASYM_PACKING check above this.
Peter Zijlstra Sept. 11, 2014, 10:13 a.m. UTC | #11
On Tue, Aug 26, 2014 at 01:06:51PM +0200, Vincent Guittot wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 18db43e..60ae1ce 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  			return true;
>  	}
>  
> +	/*
> +	 * The group capacity is reduced probably because of activity from other
> +	 * sched class or interrupts which use part of the available capacity
> +	 */
> +	if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
> +				env->sd->imbalance_pct))
> +		return true;
> +
>  	return false;
>  }
>  
> @@ -6534,13 +6542,23 @@ static int need_active_balance(struct lb_env *env)
>  	struct sched_domain *sd = env->sd;
>  
>  	if (env->idle == CPU_NEWLY_IDLE) {
> +		int src_cpu = env->src_cpu;
>  
>  		/*
>  		 * ASYM_PACKING needs to force migrate tasks from busy but
>  		 * higher numbered CPUs in order to pack all tasks in the
>  		 * lowest numbered CPUs.
>  		 */
> -		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
> +		if ((sd->flags & SD_ASYM_PACKING) && src_cpu > env->dst_cpu)
> +			return 1;
> +
> +		/*
> +		 * If the CPUs share their cache and the src_cpu's capacity is
> +		 * reduced because of other sched_class or IRQs, we trig an
> +		 * active balance to move the task
> +		 */
> +		if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
> +				sd->imbalance_pct))
>  			return 1;
>  	}

Should you not also check -- in both cases -- that the destination is
any better?

Also, there's some obvious repetition going on there, maybe add a
helper?

Also, both sites should probably ensure they're operating in the
non-saturated/overloaded scenario. Because as soon as we're completely
saturated we want SMP nice etc. and that all already works right
(presumably).
Vincent Guittot Sept. 11, 2014, 11:20 a.m. UTC | #12
On 11 September 2014 12:07, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Aug 26, 2014 at 01:06:51PM +0200, Vincent Guittot wrote:
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 18db43e..60ae1ce 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>                       return true;
>>       }
>>
>> +     /*
>> +      * The group capacity is reduced probably because of activity from other
>> +      * sched class or interrupts which use part of the available capacity
>> +      */
>> +     if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
>> +                             env->sd->imbalance_pct))
>> +             return true;
>> +
>>       return false;
>>  }
>
> This is unlikely to have worked as intended. You will never reach this,
> except on PowerPC >= 7. All other machines will have bailed at the
> !ASYM_PACKING check above this.

Ah yes, I missed that change while rebasing on Rik's patches. My use case
falls into this wider test now that we always select a busiest group.
Peter Zijlstra Sept. 11, 2014, 11:54 a.m. UTC | #13
On Tue, Aug 26, 2014 at 01:06:51PM +0200, Vincent Guittot wrote:
> +		if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
> +				sd->imbalance_pct))

Note that capacity_orig_of() is introduced a lot later in the series
(patch 10, IIRC), so this will not actually compile.
Vincent Guittot Sept. 11, 2014, 12:14 p.m. UTC | #14
On 11 September 2014 12:13, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Aug 26, 2014 at 01:06:51PM +0200, Vincent Guittot wrote:
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 18db43e..60ae1ce 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6049,6 +6049,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>                       return true;
>>       }
>>
>> +     /*
>> +      * The group capacity is reduced probably because of activity from other
>> +      * sched class or interrupts which use part of the available capacity
>> +      */
>> +     if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
>> +                             env->sd->imbalance_pct))
>> +             return true;
>> +
>>       return false;
>>  }
>>
>> @@ -6534,13 +6542,23 @@ static int need_active_balance(struct lb_env *env)
>>       struct sched_domain *sd = env->sd;
>>
>>       if (env->idle == CPU_NEWLY_IDLE) {
>> +             int src_cpu = env->src_cpu;
>>
>>               /*
>>                * ASYM_PACKING needs to force migrate tasks from busy but
>>                * higher numbered CPUs in order to pack all tasks in the
>>                * lowest numbered CPUs.
>>                */
>> -             if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
>> +             if ((sd->flags & SD_ASYM_PACKING) && src_cpu > env->dst_cpu)
>> +                     return 1;
>> +
>> +             /*
>> +              * If the CPUs share their cache and the src_cpu's capacity is
>> +              * reduced because of other sched_class or IRQs, we trig an
>> +              * active balance to move the task
>> +              */
>> +             if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
>> +                             sd->imbalance_pct))
>>                       return 1;
>>       }
>
> Should you not also check -- in both cases -- that the destination is
> any better?

The case should have been handled earlier, when calculating the
imbalance, which should be null if the destination is worse than the
source.

But I haven't formally checked that calculate_imbalance() correctly
handles that case.

>
> Also, there's some obvious repetition going on there, maybe add a
> helper?

Yes.
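
One possible shape for such a helper, shared by the update_sd_pick_busiest()
and need_active_balance() call sites, could be a simple predicate over the
two capacity values (the name and exact form here are only a sketch, not
what this patchset implements):

#include <stdbool.h>

/* Minimal stand-ins for the capacity values used at the two call sites. */
struct cpu_caps {
	unsigned long capacity;		/* capacity_of(cpu), share left for CFS */
	unsigned long capacity_orig;	/* capacity_orig_of(cpu), full capacity */
};

/*
 * True when the capacity left for CFS tasks has dropped below
 * capacity_orig / imbalance_pct, i.e. RT tasks or IRQs use a significant
 * part of the CPU.
 */
static inline bool cpu_capacity_reduced(const struct cpu_caps *c,
					unsigned int imbalance_pct)
{
	return c->capacity * imbalance_pct < c->capacity_orig * 100;
}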

>
> Also, both sites should probably ensure they're operating in the
> non-saturated/overloaded scenario. Because as soon as we're completely
> saturated we want SMP nice etc. and that all already works right
> (presumably).

If both are overloaded, calculate_imbalance() will cap the max load
that can be pulled, so the busiest group will not become idle.
diff mbox

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 18db43e..60ae1ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6049,6 +6049,14 @@  static bool update_sd_pick_busiest(struct lb_env *env,
 			return true;
 	}
 
+	/*
+	 * The group capacity is reduced probably because of activity from other
+	 * sched class or interrupts which use part of the available capacity
+	 */
+	if ((sg->sgc->capacity_orig * 100) > (sgs->group_capacity *
+				env->sd->imbalance_pct))
+		return true;
+
 	return false;
 }
 
@@ -6534,13 +6542,23 @@  static int need_active_balance(struct lb_env *env)
 	struct sched_domain *sd = env->sd;
 
 	if (env->idle == CPU_NEWLY_IDLE) {
+		int src_cpu = env->src_cpu;
 
 		/*
 		 * ASYM_PACKING needs to force migrate tasks from busy but
 		 * higher numbered CPUs in order to pack all tasks in the
 		 * lowest numbered CPUs.
 		 */
-		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
+		if ((sd->flags & SD_ASYM_PACKING) && src_cpu > env->dst_cpu)
+			return 1;
+
+		/*
+		 * If the CPUs share their cache and the src_cpu's capacity is
+		 * reduced because of other sched_class or IRQs, we trig an
+		 * active balance to move the task
+		 */
+		if ((capacity_orig_of(src_cpu) * 100) > (capacity_of(src_cpu) *
+				sd->imbalance_pct))
 			return 1;
 	}
 
@@ -6643,6 +6661,8 @@  static int load_balance(int this_cpu, struct rq *this_rq,
 
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
+	env.src_cpu = busiest->cpu;
+
 	ld_moved = 0;
 	if (busiest->nr_running > 1) {
 		/*
@@ -6652,7 +6672,6 @@  static int load_balance(int this_cpu, struct rq *this_rq,
 		 * correctly treated as an imbalance.
 		 */
 		env.flags |= LBF_ALL_PINNED;
-		env.src_cpu   = busiest->cpu;
 		env.src_rq    = busiest;
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
@@ -7359,10 +7378,12 @@  static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 
 /*
  * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
  *   - This rq has more than one task.
- *   - At any scheduler domain level, this cpu's scheduler group has multiple
- *     busy cpu's exceeding the group's capacity.
+ *   - This rq has at least one CFS task and the capacity of the CPU is
+ *     significantly reduced because of RT tasks or IRQs.
+ *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ *     multiple busy cpu.
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
@@ -7372,9 +7393,10 @@  static inline int nohz_kick_needed(struct rq *rq)
 	struct sched_domain *sd;
 	struct sched_group_capacity *sgc;
 	int nr_busy, cpu = rq->cpu;
+	bool kick = false;
 
 	if (unlikely(rq->idle_balance))
-		return 0;
+		return false;
 
        /*
 	* We may be recently in ticked or tickless idle mode. At the first
@@ -7388,38 +7410,45 @@  static inline int nohz_kick_needed(struct rq *rq)
 	 * balancing.
 	 */
 	if (likely(!atomic_read(&nohz.nr_cpus)))
-		return 0;
+		return false;
 
 	if (time_before(now, nohz.next_balance))
-		return 0;
+		return false;
 
 	if (rq->nr_running >= 2)
-		goto need_kick;
+		return true;
 
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
 	if (sd) {
 		sgc = sd->groups->sgc;
 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
 
-		if (nr_busy > 1)
-			goto need_kick_unlock;
+		if (nr_busy > 1) {
+			kick = true;
+			goto unlock;
+		}
+
 	}
 
-	sd = rcu_dereference(per_cpu(sd_asym, cpu));
+	sd = rcu_dereference(rq->sd);
+	if (sd) {
+		if ((rq->cfs.h_nr_running >= 1) &&
+				((rq->cpu_capacity * sd->imbalance_pct) <
+				(rq->cpu_capacity_orig * 100))) {
+			kick = true;
+			goto unlock;
+		}
+	}
 
+	sd = rcu_dereference(per_cpu(sd_asym, cpu));
 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
 				  sched_domain_span(sd)) < cpu))
-		goto need_kick_unlock;
+		kick = true;
 
+unlock:
 	rcu_read_unlock();
-	return 0;
-
-need_kick_unlock:
-	rcu_read_unlock();
-need_kick:
-	return 1;
+	return kick;
 }
 #else
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }