
[3/5] sched/fair: rework load_balance

Message ID 1563523105-24673-4-git-send-email-vincent.guittot@linaro.org
State New
Series sched/fair: rework the CFS load balance

Commit Message

Vincent Guittot July 19, 2019, 7:58 a.m. UTC
The load_balance algorithm contains some heuristics which have become
meaningless since the rework of the metrics and the introduction of PELT.

Furthermore, it's sometimes difficult to fix wrong scheduling decisions
because everything is based on load, whereas some imbalances are not
related to load. The current algorithm ends up creating virtual and
meaningless values like avg_load_per_task, or tweaking the state of a
group to make it look overloaded when it is not, in order to try to
migrate tasks.

load_balance should better qualify the imbalance of the group and define
clearly what has to be moved to fix this imbalance.

The type of sched_group has been extended to better reflect the type of
imbalance. We now have:
	group_has_spare
	group_fully_busy
	group_misfit_task
	group_asym_capacity
	group_imbalanced
	group_overloaded

Based on the type of sched_group, load_balance now sets what it wants to
move in order to fix the imbalance. It can be some load as before, but
also some utilization, a number of tasks or a type of task:
	migrate_task
	migrate_util
	migrate_load
	migrate_misfit

This new load_balance algorithm fixes several pending cases of wrong task
placement:
- the 1 task per CPU case on asymmetric systems
- the case of CFS tasks preempted by tasks of another class
- the case of tasks not evenly spread across groups with spare capacity

The load balance decisions have been gathered in 3 functions (a condensed
sketch of how they fit together follows this list):
- update_sd_pick_busiest() selects the busiest sched_group.
- find_busiest_group() checks if there is an imbalance between the local
  and busiest groups.
- calculate_imbalance() decides what has to be moved.
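
A condensed sketch of the new flow (pseudocode, not the actual patch code;
update_sd_lb_stats(), update_sd_pick_busiest() and calculate_imbalance()
are the real functions, sched_balanced() is a made-up placeholder for the
"is there an imbalance at all" checks):

static struct sched_group *find_busiest_group(struct lb_env *env)
{
	struct sd_lb_stats sds;

	init_sd_lb_stats(&sds);

	/* Classify each group and keep the busiest candidate */
	update_sd_lb_stats(env, &sds);

	/* No busiest group, or the groups are considered balanced: bail out */
	if (!sds.busiest || sched_balanced(env, &sds))
		return NULL;

	/* Decide what to move: load, utilization, nr of tasks or a misfit task */
	calculate_imbalance(env, &sds);

	return env->imbalance ? sds.busiest : NULL;
}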

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

---
 kernel/sched/fair.c | 539 ++++++++++++++++++++++++++++------------------------
 1 file changed, 289 insertions(+), 250 deletions(-)

-- 
2.7.4

Comments

Peter Zijlstra July 19, 2019, 12:52 p.m. UTC | #1
On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:
> @@ -7060,12 +7048,21 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;

>  enum fbq_type { regular, remote, all };

>  

>  enum group_type {

> -	group_other = 0,

> +	group_has_spare = 0,

> +	group_fully_busy,

>  	group_misfit_task,

> +	group_asym_capacity,

>  	group_imbalanced,

>  	group_overloaded,

>  };

>  

> +enum group_migration {

> +	migrate_task = 0,

> +	migrate_util,

> +	migrate_load,

> +	migrate_misfit,

> +};

> +

>  #define LBF_ALL_PINNED	0x01

>  #define LBF_NEED_BREAK	0x02

>  #define LBF_DST_PINNED  0x04

> @@ -7096,7 +7093,7 @@ struct lb_env {

>  	unsigned int		loop_max;

>  

>  	enum fbq_type		fbq_type;

> -	enum group_type		src_grp_type;

> +	enum group_migration	src_grp_type;

>  	struct list_head	tasks;

>  };

>  

> @@ -7328,7 +7325,6 @@ static int detach_tasks(struct lb_env *env)

>  {

>  	struct list_head *tasks = &env->src_rq->cfs_tasks;

>  	struct task_struct *p;

> -	unsigned long load;

>  	int detached = 0;

>  

>  	lockdep_assert_held(&env->src_rq->lock);

> @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)

>  		if (!can_migrate_task(p, env))

>  			goto next;

>  

> -		load = task_h_load(p);

> +		if (env->src_grp_type == migrate_load) {

> +			unsigned long load = task_h_load(p);

>  

> -		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

> -			goto next;

> +			if (sched_feat(LB_MIN) &&

> +			    load < 16 && !env->sd->nr_balance_failed)

> +				goto next;

> +

> +			if ((load / 2) > env->imbalance)

> +				goto next;

> +

> +			env->imbalance -= load;

> +		} else	if (env->src_grp_type == migrate_util) {

> +			unsigned long util = task_util_est(p);

> +

> +			if (util > env->imbalance)

> +				goto next;

> +

> +			env->imbalance -= util;

> +		} else if (env->src_grp_type == migrate_misfit) {

> +			unsigned long util = task_util_est(p);

> +

> +			/*

> +			 * utilization of misfit task might decrease a bit

> +			 * since it has been recorded. Be conservative in the

> +			 * condition.

> +			 */

> +			if (2*util < env->imbalance)

> +				goto next;

> +

> +			env->imbalance = 0;

> +		} else {

> +			/* Migrate task */

> +			env->imbalance--;

> +		}

>  

> -		if ((load / 2) > env->imbalance)

> -			goto next;

>  

>  		detach_task(p, env);

>  		list_add(&p->se.group_node, &env->tasks);

>  

>  		detached++;

> -		env->imbalance -= load;

>  

>  #ifdef CONFIG_PREEMPT

>  		/*


Still reading through this; maybe something like so instead?

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7057,7 +7057,7 @@ enum group_type {
 	group_overloaded,
 };
 
-enum group_migration {
+enum migration_type {
 	migrate_task = 0,
 	migrate_util,
 	migrate_load,
@@ -7094,7 +7094,7 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
-	enum group_migration	src_grp_type;
+	enum migration_type	balance_type;
 	struct list_head	tasks;
 };
 
@@ -7325,6 +7325,7 @@ static const unsigned int sched_nr_migra
 static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
+	unsigned long load, util;
 	struct task_struct *p;
 	int detached = 0;
 
@@ -7358,8 +7359,14 @@ static int detach_tasks(struct lb_env *e
 		if (!can_migrate_task(p, env))
 			goto next;
 
-		if (env->src_grp_type == migrate_load) {
-			unsigned long load = task_h_load(p);
+		switch (env->balance_type) {
+		case migrate_task:
+			/* Migrate task */
+			env->imbalance--;
+			break;
+
+		case migrate_load:
+			load = task_h_load(p);
 
 			if (sched_feat(LB_MIN) &&
 			    load < 16 && !env->sd->nr_balance_failed)
@@ -7369,15 +7376,20 @@ static int detach_tasks(struct lb_env *e
 				goto next;
 
 			env->imbalance -= load;
-		} else	if (env->src_grp_type == migrate_util) {
-			unsigned long util = task_util_est(p);
+			break;
+
+		case migrate_util:
+			util = task_util_est(p);
 
 			if (util > env->imbalance)
 				goto next;
 
 			env->imbalance -= util;
-		} else if (env->src_grp_type == migrate_misfit) {
-			unsigned long util = task_util_est(p);
+			break;
+
+
+		case migrate_misfit:
+			util = task_util_est(p);
 
 			/*
 			 * utilization of misfit task might decrease a bit
@@ -7388,9 +7400,7 @@ static int detach_tasks(struct lb_env *e
 				goto next;
 
 			env->imbalance = 0;
-		} else {
-			/* Migrate task */
-			env->imbalance--;
+			break;
 		}
 
 
@@ -8311,7 +8321,7 @@ static inline void calculate_imbalance(s
 		 * In case of asym capacity, we will try to migrate all load
 		 * to the preferred CPU
 		 */
-		env->src_grp_type = migrate_load;
+		env->balance_type = migrate_load;
 		env->imbalance = busiest->group_load;
 		return;
 	}
@@ -8323,14 +8333,14 @@ static inline void calculate_imbalance(s
 		 * the imbalance. The next load balance will take care of
 		 * balancing back the system.
 		 */
-		env->src_grp_type = migrate_task;
+		env->balance_type = migrate_task;
 		env->imbalance = 1;
 		return;
 	}
 
 	if (busiest->group_type == group_misfit_task) {
 		/* Set imbalance to allow misfit task to be balanced. */
-		env->src_grp_type = migrate_misfit;
+		env->balance_type = migrate_misfit;
 		env->imbalance = busiest->group_misfit_task_load;
 		return;
 	}
@@ -8346,7 +8356,7 @@ static inline void calculate_imbalance(s
 		 * If there is no overload, we just want to even the number of
 		 * idle cpus.
 		 */
-		env->src_grp_type = migrate_task;
+		env->balance_type = migrate_task;
 		imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);
 
 		if (sds->prefer_sibling)
@@ -8365,7 +8375,7 @@ static inline void calculate_imbalance(s
 			 * amount of load to migrate in order to balance the
 			 * system.
 			 */
-			env->src_grp_type = migrate_util;
+			env->balance_type = migrate_util;
 			imbalance = max(local->group_capacity, local->group_util) -
 				    local->group_util;
 		}
@@ -8399,7 +8409,7 @@ static inline void calculate_imbalance(s
 	 * don't want to reduce the group load below the group capacity.
 	 * Thus we look for the minimum possible imbalance.
 	 */
-	env->src_grp_type = migrate_load;
+	env->balance_type = migrate_load;
 	env->imbalance = min(
 		(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
 		(sds->avg_load - local->avg_load) * local->group_capacity
@@ -8597,7 +8607,7 @@ static struct rq *find_busiest_queue(str
 		 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
 		 * seek the "biggest" misfit task.
 		 */
-		if (env->src_grp_type == migrate_misfit) {
+		if (env->balance_type == migrate_misfit) {
 			if (rq->misfit_task_load > busiest_load) {
 				busiest_load = rq->misfit_task_load;
 				busiest = rq;
@@ -8619,7 +8629,7 @@ static struct rq *find_busiest_queue(str
 		    rq->nr_running == 1)
 			continue;
 
-		if (env->src_grp_type == migrate_task) {
+		if (env->balance_type == migrate_task) {
 			nr_running = rq->cfs.h_nr_running;
 
 			if (busiest_nr < nr_running) {
@@ -8630,7 +8640,7 @@ static struct rq *find_busiest_queue(str
 			continue;
 		}
 
-		if (env->src_grp_type == migrate_util) {
+		if (env->balance_type == migrate_util) {
 			util = cpu_util(cpu_of(rq));
 
 			if (busiest_util < util) {
@@ -8711,7 +8721,7 @@ voluntary_active_balance(struct lb_env *
 			return 1;
 	}
 
-	if (env->src_grp_type == migrate_misfit)
+	if (env->balance_type == migrate_misfit)
 		return 1;
 
 	return 0;
Peter Zijlstra July 19, 2019, 12:54 p.m. UTC | #2
On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

> index 67f0acd..472959df 100644

> --- a/kernel/sched/fair.c

> +++ b/kernel/sched/fair.c

> @@ -5376,18 +5376,6 @@ static unsigned long capacity_of(int cpu)

>  	return cpu_rq(cpu)->cpu_capacity;

>  }

>  

> -static unsigned long cpu_avg_load_per_task(int cpu)

> -{

> -	struct rq *rq = cpu_rq(cpu);

> -	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);

> -	unsigned long load_avg = cpu_runnable_load(rq);

> -

> -	if (nr_running)

> -		return load_avg / nr_running;

> -

> -	return 0;

> -}

> -

>  static void record_wakee(struct task_struct *p)

>  {

>  	/*


> @@ -7646,7 +7669,6 @@ static unsigned long task_h_load(struct task_struct *p)

>  struct sg_lb_stats {

>  	unsigned long avg_load; /*Avg load across the CPUs of the group */

>  	unsigned long group_load; /* Total load over the CPUs of the group */

> -	unsigned long load_per_task;

>  	unsigned long group_capacity;

>  	unsigned long group_util; /* Total utilization of the group */

>  	unsigned int sum_nr_running; /* Nr tasks running in the group */



> @@ -8266,76 +8293,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd

>  }

>  

>  /**

> - * fix_small_imbalance - Calculate the minor imbalance that exists

> - *			amongst the groups of a sched_domain, during

> - *			load balancing.

> - * @env: The load balancing environment.

> - * @sds: Statistics of the sched_domain whose imbalance is to be calculated.

> - */

> -static inline

> -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

> -{

> -	unsigned long tmp, capa_now = 0, capa_move = 0;

> -	unsigned int imbn = 2;

> -	unsigned long scaled_busy_load_per_task;

> -	struct sg_lb_stats *local, *busiest;

> -

> -	local = &sds->local_stat;

> -	busiest = &sds->busiest_stat;

> -

> -	if (!local->sum_h_nr_running)

> -		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);

> -	else if (busiest->load_per_task > local->load_per_task)

> -		imbn = 1;

> -

> -	scaled_busy_load_per_task =

> -		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /

> -		busiest->group_capacity;

> -

> -	if (busiest->avg_load + scaled_busy_load_per_task >=

> -	    local->avg_load + (scaled_busy_load_per_task * imbn)) {

> -		env->imbalance = busiest->load_per_task;

> -		return;

> -	}

> -

> -	/*

> -	 * OK, we don't have enough imbalance to justify moving tasks,

> -	 * however we may be able to increase total CPU capacity used by

> -	 * moving them.

> -	 */

> -

> -	capa_now += busiest->group_capacity *

> -			min(busiest->load_per_task, busiest->avg_load);

> -	capa_now += local->group_capacity *

> -			min(local->load_per_task, local->avg_load);

> -	capa_now /= SCHED_CAPACITY_SCALE;

> -

> -	/* Amount of load we'd subtract */

> -	if (busiest->avg_load > scaled_busy_load_per_task) {

> -		capa_move += busiest->group_capacity *

> -			    min(busiest->load_per_task,

> -				busiest->avg_load - scaled_busy_load_per_task);

> -	}

> -

> -	/* Amount of load we'd add */

> -	if (busiest->avg_load * busiest->group_capacity <

> -	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {

> -		tmp = (busiest->avg_load * busiest->group_capacity) /

> -		      local->group_capacity;

> -	} else {

> -		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /

> -		      local->group_capacity;

> -	}

> -	capa_move += local->group_capacity *

> -		    min(local->load_per_task, local->avg_load + tmp);

> -	capa_move /= SCHED_CAPACITY_SCALE;

> -

> -	/* Move if we gain throughput */

> -	if (capa_move > capa_now)

> -		env->imbalance = busiest->load_per_task;

> -}

> -

> -/**

>   * calculate_imbalance - Calculate the amount of imbalance present within the

>   *			 groups of a given sched_domain during load balance.

>   * @env: load balance environment


Maybe strip this out first, in a separate patch. It's all magic doo-doo.
Peter Zijlstra July 19, 2019, 1:06 p.m. UTC | #3
On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:
> @@ -7887,7 +7908,7 @@ static inline int sg_imbalanced(struct sched_group *group)

>  static inline bool

>  group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)

>  {

> -	if (sgs->sum_h_nr_running < sgs->group_weight)

> +	if (sgs->sum_nr_running < sgs->group_weight)

>  		return true;

>  

>  	if ((sgs->group_capacity * 100) >

> @@ -7908,7 +7929,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)

>  static inline bool

>  group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)

>  {

> -	if (sgs->sum_h_nr_running <= sgs->group_weight)

> +	if (sgs->sum_nr_running <= sgs->group_weight)

>  		return false;

>  

>  	if ((sgs->group_capacity * 100) <


I suspect this is a change you can pull out into a separate patch after
the big change. Yes, it makes sense to account for the presence of tasks
from other classes, but I don't think it is strictly required to be in
this patch.
Peter Zijlstra July 19, 2019, 1:12 p.m. UTC | #4
On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:

> @@ -8029,17 +8063,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,

>  		}

>  	}

>  

> -	/* Adjust by relative CPU capacity of the group */

> -	sgs->group_capacity = group->sgc->capacity;

> -	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;

> +	/* Check if dst cpu is idle and preferred to this group */

> +	if (env->sd->flags & SD_ASYM_PACKING &&

> +	    env->idle != CPU_NOT_IDLE &&

> +	    sgs->sum_h_nr_running &&

> +	    sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {

> +		sgs->group_asym_capacity = 1;

> +	}

>  

> -	if (sgs->sum_h_nr_running)

> -		sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running;

> +	sgs->group_capacity = group->sgc->capacity;

>  

>  	sgs->group_weight = group->group_weight;

>  

> -	sgs->group_no_capacity = group_is_overloaded(env, sgs);

> -	sgs->group_type = group_classify(group, sgs);

> +	sgs->group_type = group_classify(env, group, sgs);

> +

> +	/* Computing avg_load makes sense only when group is overloaded */

> +	if (sgs->group_type != group_overloaded)


The comment seems to suggest you meant: ==

> +		sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /

> +				sgs->group_capacity;

>  }

>  

>  /**

> @@ -8070,7 +8111,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,

>  	 */

>  	if (sgs->group_type == group_misfit_task &&

>  	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||

> -	     !group_has_capacity(env, &sds->local_stat)))

> +	     sds->local_stat.group_type != group_has_spare))

>  		return false;

>  

>  	if (sgs->group_type > busiest->group_type)

> @@ -8079,11 +8120,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,

>  	if (sgs->group_type < busiest->group_type)

>  		return false;

>  

> -	if (sgs->avg_load <= busiest->avg_load)

> +	/* Select the overloaded group with highest avg_load */

> +	if (sgs->group_type == group_overloaded &&

> +	    sgs->avg_load <= busiest->avg_load)


And this code does too; because with the above '!=', you're comparing
uninitialized data here, no?

> +		return false;

> +

> +	/* Prefer to move from lowest priority CPU's work */

> +	if (sgs->group_type == group_asym_capacity && sds->busiest &&

> +	    sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))

>  		return false;

>  

>  	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))

> -		goto asym_packing;

> +		goto spare_capacity;

>  

>  	/*

>  	 * Candidate sg has no more than one task per CPU and
Peter Zijlstra July 19, 2019, 1:22 p.m. UTC | #5
On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:
>  enum group_type {

> -	group_other = 0,

> +	group_has_spare = 0,

> +	group_fully_busy,

>  	group_misfit_task,

> +	group_asym_capacity,

>  	group_imbalanced,

>  	group_overloaded,

>  };


The order of this group_type is important, maybe add a few words on how
this order got to be.
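
One possible shape for such a comment (wording invented here, based on how
the order is used when picking the busiest group):

/*
 * group_type describes the state of a group of CPUs at the time of load
 * balancing. The enum is ordered by increasing "badness"; when two
 * candidate groups are compared in update_sd_pick_busiest(), the one with
 * the higher group_type is preferred as busiest, and some statistics
 * (e.g. avg_load) are only computed/meaningful for the higher types.
 */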

>  static inline enum

> -group_type group_classify(struct sched_group *group,

> +group_type group_classify(struct lb_env *env,

> +			  struct sched_group *group,

>  			  struct sg_lb_stats *sgs)

>  {

> -	if (sgs->group_no_capacity)

> +	if (group_is_overloaded(env, sgs))

>  		return group_overloaded;

>  

>  	if (sg_imbalanced(group))

> @@ -7953,7 +7975,13 @@ group_type group_classify(struct sched_group *group,

>  	if (sgs->group_misfit_task_load)

>  		return group_misfit_task;

>  

> -	return group_other;

> +	if (sgs->group_asym_capacity)

> +		return group_asym_capacity;

> +

> +	if (group_has_capacity(env, sgs))

> +		return group_has_spare;

> +

> +	return group_fully_busy;

>  }


OCD is annoyed that this function doesn't have the same / reverse order
of the one in the enum.

> @@ -8070,7 +8111,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,

>  	 */

>  	if (sgs->group_type == group_misfit_task &&

>  	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||

> -	     !group_has_capacity(env, &sds->local_stat)))

> +	     sds->local_stat.group_type != group_has_spare))

>  		return false;

>  

>  	if (sgs->group_type > busiest->group_type)

> @@ -8079,11 +8120,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,

>  	if (sgs->group_type < busiest->group_type)

>  		return false;


From reading the patch it wasn't obvious that at this point
sgs->group_type == busiest->group_type, and I wondered if
busiest->avg_load below was pointing to garbage; it isn't.

> -	if (sgs->avg_load <= busiest->avg_load)

> +	/* Select the overloaded group with highest avg_load */

> +	if (sgs->group_type == group_overloaded &&

> +	    sgs->avg_load <= busiest->avg_load)

> +		return false;

> +

> +	/* Prefer to move from lowest priority CPU's work */

> +	if (sgs->group_type == group_asym_capacity && sds->busiest &&

> +	    sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))

>  		return false;

>  

>  	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))

> -		goto asym_packing;

> +		goto spare_capacity;

>  

>  	/*

>  	 * Candidate sg has no more than one task per CPU and


Can we do a switch (sds->group_type) here? it seems to have most of them
listed.
Vincent Guittot July 19, 2019, 1:46 p.m. UTC | #6
On Fri, 19 Jul 2019 at 14:52, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:

> > @@ -7060,12 +7048,21 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;

> >  enum fbq_type { regular, remote, all };

> >

> >  enum group_type {

> > -     group_other = 0,

> > +     group_has_spare = 0,

> > +     group_fully_busy,

> >       group_misfit_task,

> > +     group_asym_capacity,

> >       group_imbalanced,

> >       group_overloaded,

> >  };

> >

> > +enum group_migration {

> > +     migrate_task = 0,

> > +     migrate_util,

> > +     migrate_load,

> > +     migrate_misfit,

> > +};

> > +

> >  #define LBF_ALL_PINNED       0x01

> >  #define LBF_NEED_BREAK       0x02

> >  #define LBF_DST_PINNED  0x04

> > @@ -7096,7 +7093,7 @@ struct lb_env {

> >       unsigned int            loop_max;

> >

> >       enum fbq_type           fbq_type;

> > -     enum group_type         src_grp_type;

> > +     enum group_migration    src_grp_type;

> >       struct list_head        tasks;

> >  };

> >

> > @@ -7328,7 +7325,6 @@ static int detach_tasks(struct lb_env *env)

> >  {

> >       struct list_head *tasks = &env->src_rq->cfs_tasks;

> >       struct task_struct *p;

> > -     unsigned long load;

> >       int detached = 0;

> >

> >       lockdep_assert_held(&env->src_rq->lock);

> > @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)

> >               if (!can_migrate_task(p, env))

> >                       goto next;

> >

> > -             load = task_h_load(p);

> > +             if (env->src_grp_type == migrate_load) {

> > +                     unsigned long load = task_h_load(p);

> >

> > -             if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

> > -                     goto next;

> > +                     if (sched_feat(LB_MIN) &&

> > +                         load < 16 && !env->sd->nr_balance_failed)

> > +                             goto next;

> > +

> > +                     if ((load / 2) > env->imbalance)

> > +                             goto next;

> > +

> > +                     env->imbalance -= load;

> > +             } else  if (env->src_grp_type == migrate_util) {

> > +                     unsigned long util = task_util_est(p);

> > +

> > +                     if (util > env->imbalance)

> > +                             goto next;

> > +

> > +                     env->imbalance -= util;

> > +             } else if (env->src_grp_type == migrate_misfit) {

> > +                     unsigned long util = task_util_est(p);

> > +

> > +                     /*

> > +                      * utilization of misfit task might decrease a bit

> > +                      * since it has been recorded. Be conservative in the

> > +                      * condition.

> > +                      */

> > +                     if (2*util < env->imbalance)

> > +                             goto next;

> > +

> > +                     env->imbalance = 0;

> > +             } else {

> > +                     /* Migrate task */

> > +                     env->imbalance--;

> > +             }

> >

> > -             if ((load / 2) > env->imbalance)

> > -                     goto next;

> >

> >               detach_task(p, env);

> >               list_add(&p->se.group_node, &env->tasks);

> >

> >               detached++;

> > -             env->imbalance -= load;

> >

> >  #ifdef CONFIG_PREEMPT

> >               /*

>

> Still reading through this; maybe something like so instead?


yes, looks easier to read
>

> ---

> --- a/kernel/sched/fair.c

> +++ b/kernel/sched/fair.c

> @@ -7057,7 +7057,7 @@ enum group_type {

>         group_overloaded,

>  };

>

> -enum group_migration {

> +enum migration_type {

>         migrate_task = 0,

>         migrate_util,

>         migrate_load,

> @@ -7094,7 +7094,7 @@ struct lb_env {

>         unsigned int            loop_max;

>

>         enum fbq_type           fbq_type;

> -       enum group_migration    src_grp_type;

> +       enum migration_type     balance_type;

>         struct list_head        tasks;

>  };

>

> @@ -7325,6 +7325,7 @@ static const unsigned int sched_nr_migra

>  static int detach_tasks(struct lb_env *env)

>  {

>         struct list_head *tasks = &env->src_rq->cfs_tasks;

> +       unsigned long load, util;

>         struct task_struct *p;

>         int detached = 0;

>

> @@ -7358,8 +7359,14 @@ static int detach_tasks(struct lb_env *e

>                 if (!can_migrate_task(p, env))

>                         goto next;

>

> -               if (env->src_grp_type == migrate_load) {

> -                       unsigned long load = task_h_load(p);

> +               switch (env->balance_type) {

> +               case migrate_task:

> +                       /* Migrate task */

> +                       env->imbalance--;

> +                       break;

> +

> +               case migrate_load:

> +                       load = task_h_load(p);

>

>                         if (sched_feat(LB_MIN) &&

>                             load < 16 && !env->sd->nr_balance_failed)

> @@ -7369,15 +7376,20 @@ static int detach_tasks(struct lb_env *e

>                                 goto next;

>

>                         env->imbalance -= load;

> -               } else  if (env->src_grp_type == migrate_util) {

> -                       unsigned long util = task_util_est(p);

> +                       break;

> +

> +               case migrate_util:

> +                       util = task_util_est(p);

>

>                         if (util > env->imbalance)

>                                 goto next;

>

>                         env->imbalance -= util;

> -               } else if (env->src_grp_type == migrate_misfit) {

> -                       unsigned long util = task_util_est(p);

> +                       break;

> +

> +

> +               case migrate_misfit:

> +                       util = task_util_est(p);

>

>                         /*

>                          * utilization of misfit task might decrease a bit

> @@ -7388,9 +7400,7 @@ static int detach_tasks(struct lb_env *e

>                                 goto next;

>

>                         env->imbalance = 0;

> -               } else {

> -                       /* Migrate task */

> -                       env->imbalance--;

> +                       break;

>                 }

>

>

> @@ -8311,7 +8321,7 @@ static inline void calculate_imbalance(s

>                  * In case of asym capacity, we will try to migrate all load

>                  * to the preferred CPU

>                  */

> -               env->src_grp_type = migrate_load;

> +               env->balance_type = migrate_load;

>                 env->imbalance = busiest->group_load;

>                 return;

>         }

> @@ -8323,14 +8333,14 @@ static inline void calculate_imbalance(s

>                  * the imbalance. The next load balance will take care of

>                  * balancing back the system.

>                  */

> -               env->src_grp_type = migrate_task;

> +               env->balance_type = migrate_task;

>                 env->imbalance = 1;

>                 return;

>         }

>

>         if (busiest->group_type == group_misfit_task) {

>                 /* Set imbalance to allow misfit task to be balanced. */

> -               env->src_grp_type = migrate_misfit;

> +               env->balance_type = migrate_misfit;

>                 env->imbalance = busiest->group_misfit_task_load;

>                 return;

>         }

> @@ -8346,7 +8356,7 @@ static inline void calculate_imbalance(s

>                  * If there is no overload, we just want to even the number of

>                  * idle cpus.

>                  */

> -               env->src_grp_type = migrate_task;

> +               env->balance_type = migrate_task;

>                 imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);

>

>                 if (sds->prefer_sibling)

> @@ -8365,7 +8375,7 @@ static inline void calculate_imbalance(s

>                          * amount of load to migrate in order to balance the

>                          * system.

>                          */

> -                       env->src_grp_type = migrate_util;

> +                       env->balance_type = migrate_util;

>                         imbalance = max(local->group_capacity, local->group_util) -

>                                     local->group_util;

>                 }

> @@ -8399,7 +8409,7 @@ static inline void calculate_imbalance(s

>          * don't want to reduce the group load below the group capacity.

>          * Thus we look for the minimum possible imbalance.

>          */

> -       env->src_grp_type = migrate_load;

> +       env->balance_type = migrate_load;

>         env->imbalance = min(

>                 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,

>                 (sds->avg_load - local->avg_load) * local->group_capacity

> @@ -8597,7 +8607,7 @@ static struct rq *find_busiest_queue(str

>                  * For ASYM_CPUCAPACITY domains with misfit tasks we simply

>                  * seek the "biggest" misfit task.

>                  */

> -               if (env->src_grp_type == migrate_misfit) {

> +               if (env->balance_type == migrate_misfit) {

>                         if (rq->misfit_task_load > busiest_load) {

>                                 busiest_load = rq->misfit_task_load;

>                                 busiest = rq;

> @@ -8619,7 +8629,7 @@ static struct rq *find_busiest_queue(str

>                     rq->nr_running == 1)

>                         continue;

>

> -               if (env->src_grp_type == migrate_task) {

> +               if (env->balance_type == migrate_task) {

>                         nr_running = rq->cfs.h_nr_running;

>

>                         if (busiest_nr < nr_running) {

> @@ -8630,7 +8640,7 @@ static struct rq *find_busiest_queue(str

>                         continue;

>                 }

>

> -               if (env->src_grp_type == migrate_util) {

> +               if (env->balance_type == migrate_util) {

>                         util = cpu_util(cpu_of(rq));

>

>                         if (busiest_util < util) {

> @@ -8711,7 +8721,7 @@ voluntary_active_balance(struct lb_env *

>                         return 1;

>         }

>

> -       if (env->src_grp_type == migrate_misfit)

> +       if (env->balance_type == migrate_misfit)

>                 return 1;

>

>         return 0;
Vincent Guittot July 19, 2019, 1:55 p.m. UTC | #7
On Fri, 19 Jul 2019 at 15:22, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:

> >  enum group_type {

> > -     group_other = 0,

> > +     group_has_spare = 0,

> > +     group_fully_busy,

> >       group_misfit_task,

> > +     group_asym_capacity,

> >       group_imbalanced,

> >       group_overloaded,

> >  };

>

> The order of this group_type is important, maybe add a few words on how

> this order got to be.


Yes, I will
>

> >  static inline enum

> > -group_type group_classify(struct sched_group *group,

> > +group_type group_classify(struct lb_env *env,

> > +                       struct sched_group *group,

> >                         struct sg_lb_stats *sgs)

> >  {

> > -     if (sgs->group_no_capacity)

> > +     if (group_is_overloaded(env, sgs))

> >               return group_overloaded;

> >

> >       if (sg_imbalanced(group))

> > @@ -7953,7 +7975,13 @@ group_type group_classify(struct sched_group *group,

> >       if (sgs->group_misfit_task_load)

> >               return group_misfit_task;

> >

> > -     return group_other;

> > +     if (sgs->group_asym_capacity)

> > +             return group_asym_capacity;

> > +

> > +     if (group_has_capacity(env, sgs))

> > +             return group_has_spare;

> > +

> > +     return group_fully_busy;

> >  }

>

> OCD is annoyed that this function doesn't have the same / reverse order

> of the one in the enum.


I will reorder them

>

> > @@ -8070,7 +8111,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,

> >        */

> >       if (sgs->group_type == group_misfit_task &&

> >           (!group_smaller_max_cpu_capacity(sg, sds->local) ||

> > -          !group_has_capacity(env, &sds->local_stat)))

> > +          sds->local_stat.group_type != group_has_spare))

> >               return false;

> >

> >       if (sgs->group_type > busiest->group_type)

> > @@ -8079,11 +8120,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,

> >       if (sgs->group_type < busiest->group_type)

> >               return false;

>

> from reading the patch it wasn't obvious that at this point


I will add a comment to mention this

> sgs->group_type == busiest->group_type, and I wondered if

> busiest->avg_load below was pointing to garbage, it isn't.

>

> > -     if (sgs->avg_load <= busiest->avg_load)

> > +     /* Select the overloaded group with highest avg_load */

> > +     if (sgs->group_type == group_overloaded &&

> > +         sgs->avg_load <= busiest->avg_load)

> > +             return false;

> > +

> > +     /* Prefer to move from lowest priority CPU's work */

> > +     if (sgs->group_type == group_asym_capacity && sds->busiest &&

> > +         sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))

> >               return false;

> >

> >       if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))

> > -             goto asym_packing;

> > +             goto spare_capacity;

> >

> >       /*

> >        * Candidate sg has no more than one task per CPU and

>

> Can we do a switch (sds->group_type) here? it seems to have most of them

> listed.


I will have a look, but I'm afraid that the
 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
check prevents us from getting something readable.
Vincent Guittot July 19, 2019, 1:57 p.m. UTC | #8
On Fri, 19 Jul 2019 at 15:06, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:

> > @@ -7887,7 +7908,7 @@ static inline int sg_imbalanced(struct sched_group *group)

> >  static inline bool

> >  group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)

> >  {

> > -     if (sgs->sum_h_nr_running < sgs->group_weight)

> > +     if (sgs->sum_nr_running < sgs->group_weight)

> >               return true;

> >

> >       if ((sgs->group_capacity * 100) >

> > @@ -7908,7 +7929,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)

> >  static inline bool

> >  group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)

> >  {

> > -     if (sgs->sum_h_nr_running <= sgs->group_weight)

> > +     if (sgs->sum_nr_running <= sgs->group_weight)

> >               return false;

> >

> >       if ((sgs->group_capacity * 100) <

>

> I suspect this is a change you can pull out into a separate patch after

> the big change. Yes it makes sense to account the other class' task

> presence, but I don't think it is strictly required to be in this patch.


yes
Vincent Guittot July 19, 2019, 2:02 p.m. UTC | #9
On Fri, 19 Jul 2019 at 14:54, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:

>

> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

> > index 67f0acd..472959df 100644

> > --- a/kernel/sched/fair.c

> > +++ b/kernel/sched/fair.c

> > @@ -5376,18 +5376,6 @@ static unsigned long capacity_of(int cpu)

> >       return cpu_rq(cpu)->cpu_capacity;

> >  }

> >

> > -static unsigned long cpu_avg_load_per_task(int cpu)

> > -{

> > -     struct rq *rq = cpu_rq(cpu);

> > -     unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);

> > -     unsigned long load_avg = cpu_runnable_load(rq);

> > -

> > -     if (nr_running)

> > -             return load_avg / nr_running;

> > -

> > -     return 0;

> > -}

> > -

> >  static void record_wakee(struct task_struct *p)

> >  {

> >       /*

>

> > @@ -7646,7 +7669,6 @@ static unsigned long task_h_load(struct task_struct *p)

> >  struct sg_lb_stats {

> >       unsigned long avg_load; /*Avg load across the CPUs of the group */

> >       unsigned long group_load; /* Total load over the CPUs of the group */

> > -     unsigned long load_per_task;

> >       unsigned long group_capacity;

> >       unsigned long group_util; /* Total utilization of the group */

> >       unsigned int sum_nr_running; /* Nr tasks running in the group */

>

>

> > @@ -8266,76 +8293,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd

> >  }

> >

> >  /**

> > - * fix_small_imbalance - Calculate the minor imbalance that exists

> > - *                   amongst the groups of a sched_domain, during

> > - *                   load balancing.

> > - * @env: The load balancing environment.

> > - * @sds: Statistics of the sched_domain whose imbalance is to be calculated.

> > - */

> > -static inline

> > -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

> > -{

> > -     unsigned long tmp, capa_now = 0, capa_move = 0;

> > -     unsigned int imbn = 2;

> > -     unsigned long scaled_busy_load_per_task;

> > -     struct sg_lb_stats *local, *busiest;

> > -

> > -     local = &sds->local_stat;

> > -     busiest = &sds->busiest_stat;

> > -

> > -     if (!local->sum_h_nr_running)

> > -             local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);

> > -     else if (busiest->load_per_task > local->load_per_task)

> > -             imbn = 1;

> > -

> > -     scaled_busy_load_per_task =

> > -             (busiest->load_per_task * SCHED_CAPACITY_SCALE) /

> > -             busiest->group_capacity;

> > -

> > -     if (busiest->avg_load + scaled_busy_load_per_task >=

> > -         local->avg_load + (scaled_busy_load_per_task * imbn)) {

> > -             env->imbalance = busiest->load_per_task;

> > -             return;

> > -     }

> > -

> > -     /*

> > -      * OK, we don't have enough imbalance to justify moving tasks,

> > -      * however we may be able to increase total CPU capacity used by

> > -      * moving them.

> > -      */

> > -

> > -     capa_now += busiest->group_capacity *

> > -                     min(busiest->load_per_task, busiest->avg_load);

> > -     capa_now += local->group_capacity *

> > -                     min(local->load_per_task, local->avg_load);

> > -     capa_now /= SCHED_CAPACITY_SCALE;

> > -

> > -     /* Amount of load we'd subtract */

> > -     if (busiest->avg_load > scaled_busy_load_per_task) {

> > -             capa_move += busiest->group_capacity *

> > -                         min(busiest->load_per_task,

> > -                             busiest->avg_load - scaled_busy_load_per_task);

> > -     }

> > -

> > -     /* Amount of load we'd add */

> > -     if (busiest->avg_load * busiest->group_capacity <

> > -         busiest->load_per_task * SCHED_CAPACITY_SCALE) {

> > -             tmp = (busiest->avg_load * busiest->group_capacity) /

> > -                   local->group_capacity;

> > -     } else {

> > -             tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /

> > -                   local->group_capacity;

> > -     }

> > -     capa_move += local->group_capacity *

> > -                 min(local->load_per_task, local->avg_load + tmp);

> > -     capa_move /= SCHED_CAPACITY_SCALE;

> > -

> > -     /* Move if we gain throughput */

> > -     if (capa_move > capa_now)

> > -             env->imbalance = busiest->load_per_task;

> > -}

> > -

> > -/**

> >   * calculate_imbalance - Calculate the amount of imbalance present within the

> >   *                    groups of a given sched_domain during load balance.

> >   * @env: load balance environment

>

> Maybe strip this out first, in a separate patch. It's all magic doo-doo.


If I remove that first, we will have a commit in the branch that might
temporarily regress some performance, and bisect might spot it while
looking for a culprit, won't it?
Vincent Guittot July 19, 2019, 2:13 p.m. UTC | #10
On Fri, 19 Jul 2019 at 15:12, Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:

>

> > @@ -8029,17 +8063,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,

> >               }

> >       }

> >

> > -     /* Adjust by relative CPU capacity of the group */

> > -     sgs->group_capacity = group->sgc->capacity;

> > -     sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;

> > +     /* Check if dst cpu is idle and preferred to this group */

> > +     if (env->sd->flags & SD_ASYM_PACKING &&

> > +         env->idle != CPU_NOT_IDLE &&

> > +         sgs->sum_h_nr_running &&

> > +         sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {

> > +             sgs->group_asym_capacity = 1;

> > +     }

> >

> > -     if (sgs->sum_h_nr_running)

> > -             sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running;

> > +     sgs->group_capacity = group->sgc->capacity;

> >

> >       sgs->group_weight = group->group_weight;

> >

> > -     sgs->group_no_capacity = group_is_overloaded(env, sgs);

> > -     sgs->group_type = group_classify(group, sgs);

> > +     sgs->group_type = group_classify(env, group, sgs);

> > +

> > +     /* Computing avg_load makes sense only when group is overloaded */

> > +     if (sgs->group_type != group_overloaded)

>

> The comment seems to suggest you meant: ==


Yes, looks like you're right :-(

>

> > +             sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /

> > +                             sgs->group_capacity;

> >  }

> >

> >  /**

> > @@ -8070,7 +8111,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,

> >        */

> >       if (sgs->group_type == group_misfit_task &&

> >           (!group_smaller_max_cpu_capacity(sg, sds->local) ||

> > -          !group_has_capacity(env, &sds->local_stat)))

> > +          sds->local_stat.group_type != group_has_spare))

> >               return false;

> >

> >       if (sgs->group_type > busiest->group_type)

> > @@ -8079,11 +8120,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,

> >       if (sgs->group_type < busiest->group_type)

> >               return false;

> >

> > -     if (sgs->avg_load <= busiest->avg_load)

> > +     /* Select the overloaded group with highest avg_load */

> > +     if (sgs->group_type == group_overloaded &&

> > +         sgs->avg_load <= busiest->avg_load)

>

> And this code does too; because with the above '!=', you're comparing

> uninitialized data here, no?


avg_load is always 0 in that case,
and load_balance was quite conservative when the system was overloaded.

>

> > +             return false;

> > +

> > +     /* Prefer to move from lowest priority CPU's work */

> > +     if (sgs->group_type == group_asym_capacity && sds->busiest &&

> > +         sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))

> >               return false;

> >

> >       if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))

> > -             goto asym_packing;

> > +             goto spare_capacity;

> >

> >       /*

> >        * Candidate sg has no more than one task per CPU and
Peter Zijlstra July 20, 2019, 11:31 a.m. UTC | #11
On Fri, Jul 19, 2019 at 04:02:15PM +0200, Vincent Guittot wrote:
> On Fri, 19 Jul 2019 at 14:54, Peter Zijlstra <peterz@infradead.org> wrote:

> >

> > On Fri, Jul 19, 2019 at 09:58:23AM +0200, Vincent Guittot wrote:


> > > -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)


> > Maybe strip this out first, in a separate patch. It's all magic doo-doo.

> 

> If I remove that 1st, we will have commit in the branch that might

> regress some performance temporarily and bisect might spot it while

> looking for a culprit, isn't it ?


Maybe; but at least for review it is much better to not have it all
squashed in one patch.
Valentin Schneider July 25, 2019, 5:17 p.m. UTC | #12
Hi Vincent,

first batch of questions/comments here...

On 19/07/2019 08:58, Vincent Guittot wrote:
[...]
>  kernel/sched/fair.c | 539 ++++++++++++++++++++++++++++------------------------

>  1 file changed, 289 insertions(+), 250 deletions(-)

> 

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

> index 67f0acd..472959df 100644

> --- a/kernel/sched/fair.c

> +++ b/kernel/sched/fair.c

> @@ -3771,7 +3771,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)

>  		return;

>  	}

>  

> -	rq->misfit_task_load = task_h_load(p);

> +	rq->misfit_task_load = task_util_est(p);


Huh, interesting. Why go for utilization?

Right now we store the load of the task and use it to pick the "biggest"
misfit (in terms of load) when there is more than one misfit task to
choose from:

update_sd_pick_busiest():
,----
| /*
|  * If we have more than one misfit sg go with the biggest misfit.
|  */
| if (sgs->group_type == group_misfit_task &&
|     sgs->group_misfit_task_load < busiest->group_misfit_task_load)
| 	return false;
`----

I don't think it makes much sense to maximize utilization for misfit tasks:
they're over the capacity margin, which exactly means "I can't really tell
you much on that utilization other than it doesn't fit".

At the very least, this rq field should be renamed "misfit_task_util".

[...]

> @@ -7060,12 +7048,21 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;

>  enum fbq_type { regular, remote, all };

>  

>  enum group_type {

> -	group_other = 0,

> +	group_has_spare = 0,

> +	group_fully_busy,

>  	group_misfit_task,

> +	group_asym_capacity,

>  	group_imbalanced,

>  	group_overloaded,

>  };

>  

> +enum group_migration {

> +	migrate_task = 0,

> +	migrate_util,

> +	migrate_load,

> +	migrate_misfit,


Can't we have only 3 imbalance types (task, util, load), and make misfit
fall in that first one? Arguably it is a special kind of task balance,
since it would go straight for the active balance, but it would fit a
`migrate_task` imbalance with a "go straight for active balance" flag
somewhere.
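
A minimal sketch of that alternative, as a fragment of calculate_imbalance()
(LBF_ACTIVE_LB below is an invented flag for illustration, not something in
this patch):

	if (busiest->group_type == group_misfit_task) {
		/* One task to move, and go straight for active balance */
		env->src_grp_type = migrate_task;
		env->imbalance = 1;
		env->flags |= LBF_ACTIVE_LB;	/* hypothetical flag */
		return;
	}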

[...]

> @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)

>  		if (!can_migrate_task(p, env))

>  			goto next;

>  

> -		load = task_h_load(p);

> +		if (env->src_grp_type == migrate_load) {

> +			unsigned long load = task_h_load(p);

>  

> -		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

> -			goto next;

> +			if (sched_feat(LB_MIN) &&

> +			    load < 16 && !env->sd->nr_balance_failed)

> +				goto next;

> +

> +			if ((load / 2) > env->imbalance)

> +				goto next;

> +

> +			env->imbalance -= load;

> +		} else	if (env->src_grp_type == migrate_util) {

> +			unsigned long util = task_util_est(p);

> +

> +			if (util > env->imbalance)

> +				goto next;

> +

> +			env->imbalance -= util;

> +		} else if (env->src_grp_type == migrate_misfit) {

> +			unsigned long util = task_util_est(p);

> +

> +			/*

> +			 * utilization of misfit task might decrease a bit

> +			 * since it has been recorded. Be conservative in the

> +			 * condition.

> +			 */

> +			if (2*util < env->imbalance)

> +				goto next;


Other than I'm not convinced we should track utilization for misfit tasks,
can the utilization of the task really be halved between the last
update_misfit_status() and here?

> +

> +			env->imbalance = 0;

> +		} else {

> +			/* Migrate task */

> +			env->imbalance--;

> +		}

>  

> -		if ((load / 2) > env->imbalance)

> -			goto next;

>  

>  		detach_task(p, env);

>  		list_add(&p->se.group_node, &env->tasks);

>  

>  		detached++;

> -		env->imbalance -= load;

>  

>  #ifdef CONFIG_PREEMPT

>  		/*


[...]

> @@ -8357,72 +8318,115 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s

>  	if (busiest->group_type == group_imbalanced) {

>  		/*

>  		 * In the group_imb case we cannot rely on group-wide averages

> -		 * to ensure CPU-load equilibrium, look at wider averages. XXX

> +		 * to ensure CPU-load equilibrium, try to move any task to fix

> +		 * the imbalance. The next load balance will take care of

> +		 * balancing back the system.

>  		 */

> -		busiest->load_per_task =

> -			min(busiest->load_per_task, sds->avg_load);

> +		env->src_grp_type = migrate_task;

> +		env->imbalance = 1;

> +		return;

>  	}

>  

> -	/*

> -	 * Avg load of busiest sg can be less and avg load of local sg can

> -	 * be greater than avg load across all sgs of sd because avg load

> -	 * factors in sg capacity and sgs with smaller group_type are

> -	 * skipped when updating the busiest sg:

> -	 */

> -	if (busiest->group_type != group_misfit_task &&

> -	    (busiest->avg_load <= sds->avg_load ||

> -	     local->avg_load >= sds->avg_load)) {

> -		env->imbalance = 0;

> -		return fix_small_imbalance(env, sds);

> +	if (busiest->group_type == group_misfit_task) {

> +		/* Set imbalance to allow misfit task to be balanced. */

> +		env->src_grp_type = migrate_misfit;

> +		env->imbalance = busiest->group_misfit_task_load;

> +		return;

>  	}

>  

>  	/*

> -	 * If there aren't any idle CPUs, avoid creating some.

> +	 * Try to use spare capacity of local group without overloading it or

> +	 * emptying busiest

>  	 */

> -	if (busiest->group_type == group_overloaded &&

> -	    local->group_type   == group_overloaded) {

> -		load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE;

> -		if (load_above_capacity > busiest->group_capacity) {

> -			load_above_capacity -= busiest->group_capacity;

> -			load_above_capacity *= scale_load_down(NICE_0_LOAD);

> -			load_above_capacity /= busiest->group_capacity;

> -		} else

> -			load_above_capacity = ~0UL;

> +	if (local->group_type == group_has_spare) {

> +		long imbalance;

> +

> +		/*

> +		 * If there is no overload, we just want to even the number of

> +		 * idle cpus.

> +		 */

> +		env->src_grp_type = migrate_task;

> +		imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);

> +

> +		if (sds->prefer_sibling)

> +			/*

> +			 * When prefer sibling, evenly spread running tasks on

> +			 * groups.

> +			 */

> +			imbalance = (busiest->sum_nr_running - local->sum_nr_running) >> 1;

> +

> +		if (busiest->group_type > group_fully_busy) {

> +			/*

> +			 * If busiest is overloaded, try to fill spare

> +			 * capacity. This might end up creating spare capacity

> +			 * in busiest or busiest still being overloaded but

> +			 * there is no simple way to directly compute the

> +			 * amount of load to migrate in order to balance the

> +			 * system.

> +			 */

> +			env->src_grp_type = migrate_util;

> +			imbalance = max(local->group_capacity, local->group_util) -

> +				    local->group_util;


Rather than filling the local group, shouldn't we follow the same strategy
as for load, IOW try to reach an average without pushing local above it nor
busiest below it?

We could build an sds->avg_util similar to sds->avg_load.
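
A sketch of what that could look like (sds->avg_util and sds->total_util do
not exist in the patch and would have to be accumulated in
update_sd_lb_stats(), like total_load is today):

	/* Hypothetical: domain-wide average utilization, mirroring avg_load */
	sds->avg_util = (SCHED_CAPACITY_SCALE * sds->total_util) /
				sds->total_capacity;

	env->src_grp_type = migrate_util;
	/* Don't push local above the average nor pull busiest below it */
	env->imbalance = min(busiest->group_util -
			     (sds->avg_util * busiest->group_capacity) / SCHED_CAPACITY_SCALE,
			     (sds->avg_util * local->group_capacity) / SCHED_CAPACITY_SCALE -
			     local->group_util);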

> +		}

> +

> +		env->imbalance = imbalance;

> +		return;

>  	}

[...]
> +/*

> + * Decision matrix according to the local and busiest group state

> + *

> + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded

> + * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced

> + * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced

> + * misfit_task      force     N/A        N/A    N/A  force      force

> + * asym_capacity    force     force      N/A    N/A  force      force

> + * imbalanced       force     force      N/A    N/A  force      force

> + * overloaded       force     force      N/A    N/A  force      avg_load

> + *

> + * N/A :      Not Applicable because already filtered while updating

> + *            statistics.

> + * balanced : The system is balanced for these 2 groups.

> + * force :    Calculate the imbalance as load migration is probably needed.

> + * avg_load : Only if imbalance is significant enough.

> + * nr_idle :  dst_cpu is not busy and the number of idle cpus is quite

> + *            different in groups.

> + */

> +


Nice!
Vincent Guittot July 26, 2019, 9:01 a.m. UTC | #13
On Thu, 25 Jul 2019 at 19:17, Valentin Schneider
<valentin.schneider@arm.com> wrote:
>

> Hi Vincent,

>

> first batch of questions/comments here...

>

> On 19/07/2019 08:58, Vincent Guittot wrote:

> [...]

> >  kernel/sched/fair.c | 539 ++++++++++++++++++++++++++++------------------------

> >  1 file changed, 289 insertions(+), 250 deletions(-)

> >

> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

> > index 67f0acd..472959df 100644

> > --- a/kernel/sched/fair.c

> > +++ b/kernel/sched/fair.c

> > @@ -3771,7 +3771,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)

> >               return;

> >       }

> >

> > -     rq->misfit_task_load = task_h_load(p);

> > +     rq->misfit_task_load = task_util_est(p);

>

> Huh, interesting. Why go for utilization?


Mainly because that's what is used to detect a misfit task, not the load.

>

> Right now we store the load of the task and use it to pick the "biggest"

> misfit (in terms of load) when there are more than one misfit tasks to

> choose:


But having a big load doesn't mean that you have a big utilization,
so you can trigger the misfit case because of a task A with a big
utilization that doesn't fit on its local CPU, but then select in
detach_tasks() a task B that has a small utilization but a big weight and,
as a result, a higher load.
Task B will never trigger the misfit use case by itself and should not
steal the pulling opportunity of task A.
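
A rough numeric illustration of that scenario (values invented, orders of
magnitude only):

/*
 * Task A: util ~900 on a CPU of capacity 512 -> triggers the misfit case;
 *         nice 0 so weight 1024, task_h_load(A) ~1000.
 * Task B: util ~150, but nice -10 so weight ~9548;
 *         task_h_load(B) ~1400, i.e. larger than task_h_load(A).
 *
 * Picking by load in detach_tasks() would pull B, which never triggered
 * the misfit case; picking by utilization pulls A.
 */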

>

> update_sd_pick_busiest():

> ,----

> | /*

> |  * If we have more than one misfit sg go with the biggest misfit.

> |  */

> | if (sgs->group_type == group_misfit_task &&

> |     sgs->group_misfit_task_load < busiest->group_misfit_task_load)

> |       return false;

> `----

>

> I don't think it makes much sense to maximize utilization for misfit tasks:

> they're over the capacity margin, which exactly means "I can't really tell

> you much on that utilization other than it doesn't fit".

>

> At the very least, this rq field should be renamed "misfit_task_util".


Yes, I agree that I should rename the field.

>

> [...]

>

> > @@ -7060,12 +7048,21 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;

> >  enum fbq_type { regular, remote, all };

> >

> >  enum group_type {

> > -     group_other = 0,

> > +     group_has_spare = 0,

> > +     group_fully_busy,

> >       group_misfit_task,

> > +     group_asym_capacity,

> >       group_imbalanced,

> >       group_overloaded,

> >  };

> >

> > +enum group_migration {

> > +     migrate_task = 0,

> > +     migrate_util,

> > +     migrate_load,

> > +     migrate_misfit,

>

> Can't we have only 3 imbalance types (task, util, load), and make misfit

> fall in that first one? Arguably it is a special kind of task balance,

> since it would go straight for the active balance, but it would fit a

> `migrate_task` imbalance with a "go straight for active balance" flag

> somewhere.


migrate_misfit uses its own special condition to detect the task that
can be pulled, compared to the other migration types.

>

> [...]

>

> > @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)

> >               if (!can_migrate_task(p, env))

> >                       goto next;

> >

> > -             load = task_h_load(p);

> > +             if (env->src_grp_type == migrate_load) {

> > +                     unsigned long load = task_h_load(p);

> >

> > -             if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

> > -                     goto next;

> > +                     if (sched_feat(LB_MIN) &&

> > +                         load < 16 && !env->sd->nr_balance_failed)

> > +                             goto next;

> > +

> > +                     if ((load / 2) > env->imbalance)

> > +                             goto next;

> > +

> > +                     env->imbalance -= load;

> > +             } else  if (env->src_grp_type == migrate_util) {

> > +                     unsigned long util = task_util_est(p);

> > +

> > +                     if (util > env->imbalance)

> > +                             goto next;

> > +

> > +                     env->imbalance -= util;

> > +             } else if (env->src_grp_type == migrate_misfit) {

> > +                     unsigned long util = task_util_est(p);

> > +

> > +                     /*

> > +                      * utilization of misfit task might decrease a bit

> > +                      * since it has been recorded. Be conservative in the

> > +                      * condition.

> > +                      */

> > +                     if (2*util < env->imbalance)

> > +                             goto next;

>

> Other than I'm not convinced we should track utilization for misfit tasks,

> can the utilization of the task really be halved between the last

> update_misfit_status() and here?


No it's not, but this margin is quite simple to compute and should
cover all cases:
it's high enough to filter out other non-misfit tasks and small enough
to cope with a decrease of the utilization of the misfit task since we
detected the use case.

>

> > +

> > +                     env->imbalance = 0;

> > +             } else {

> > +                     /* Migrate task */

> > +                     env->imbalance--;

> > +             }

> >

> > -             if ((load / 2) > env->imbalance)

> > -                     goto next;

> >

> >               detach_task(p, env);

> >               list_add(&p->se.group_node, &env->tasks);

> >

> >               detached++;

> > -             env->imbalance -= load;

> >

> >  #ifdef CONFIG_PREEMPT

> >               /*

>

> [...]

>

> > @@ -8357,72 +8318,115 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s

> >       if (busiest->group_type == group_imbalanced) {

> >               /*

> >                * In the group_imb case we cannot rely on group-wide averages

> > -              * to ensure CPU-load equilibrium, look at wider averages. XXX

> > +              * to ensure CPU-load equilibrium, try to move any task to fix

> > +              * the imbalance. The next load balance will take care of

> > +              * balancing back the system.

> >                */

> > -             busiest->load_per_task =

> > -                     min(busiest->load_per_task, sds->avg_load);

> > +             env->src_grp_type = migrate_task;

> > +             env->imbalance = 1;

> > +             return;

> >       }

> >

> > -     /*

> > -      * Avg load of busiest sg can be less and avg load of local sg can

> > -      * be greater than avg load across all sgs of sd because avg load

> > -      * factors in sg capacity and sgs with smaller group_type are

> > -      * skipped when updating the busiest sg:

> > -      */

> > -     if (busiest->group_type != group_misfit_task &&

> > -         (busiest->avg_load <= sds->avg_load ||

> > -          local->avg_load >= sds->avg_load)) {

> > -             env->imbalance = 0;

> > -             return fix_small_imbalance(env, sds);

> > +     if (busiest->group_type == group_misfit_task) {

> > +             /* Set imbalance to allow misfit task to be balanced. */

> > +             env->src_grp_type = migrate_misfit;

> > +             env->imbalance = busiest->group_misfit_task_load;

> > +             return;

> >       }

> >

> >       /*

> > -      * If there aren't any idle CPUs, avoid creating some.

> > +      * Try to use spare capacity of local group without overloading it or

> > +      * emptying busiest

> >        */

> > -     if (busiest->group_type == group_overloaded &&

> > -         local->group_type   == group_overloaded) {

> > -             load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE;

> > -             if (load_above_capacity > busiest->group_capacity) {

> > -                     load_above_capacity -= busiest->group_capacity;

> > -                     load_above_capacity *= scale_load_down(NICE_0_LOAD);

> > -                     load_above_capacity /= busiest->group_capacity;

> > -             } else

> > -                     load_above_capacity = ~0UL;

> > +     if (local->group_type == group_has_spare) {

> > +             long imbalance;

> > +

> > +             /*

> > +              * If there is no overload, we just want to even the number of

> > +              * idle cpus.

> > +              */

> > +             env->src_grp_type = migrate_task;

> > +             imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);

> > +

> > +             if (sds->prefer_sibling)

> > +                     /*

> > +                      * When prefer sibling, evenly spread running tasks on

> > +                      * groups.

> > +                      */

> > +                     imbalance = (busiest->sum_nr_running - local->sum_nr_running) >> 1;

> > +

> > +             if (busiest->group_type > group_fully_busy) {

> > +                     /*

> > +                      * If busiest is overloaded, try to fill spare

> > +                      * capacity. This might end up creating spare capacity

> > +                      * in busiest or busiest still being overloaded but

> > +                      * there is no simple way to directly compute the

> > +                      * amount of load to migrate in order to balance the

> > +                      * system.

> > +                      */

> > +                     env->src_grp_type = migrate_util;

> > +                     imbalance = max(local->group_capacity, local->group_util) -

> > +                                 local->group_util;

>

> Rather than filling the local group, shouldn't we follow the same strategy

> as for load, IOW try to reach an average without pushing local above nor

> busiest below ?


But we don't know if this will be enough to make the busiest group not
overloaded anymore.

This is a transient state:
a group is overloaded, another one has spare capacity.
How to balance the system will depend on how much overload is in the
group, and we don't know this value.
The only solution is to:
- try to pull as many tasks as possible to fill the spare capacity
- Is the group still overloaded? Use avg_load to balance the system
because both groups will be overloaded
- Is the group no longer overloaded? Balance the number of idle cpus
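Roughly, over successive load_balance() runs (this just recombines the
hunks quoted above, it is not new code):

	/* pass 1: local has spare capacity, busiest is overloaded */
	env->src_grp_type = migrate_util;
	env->imbalance = max(local->group_capacity, local->group_util) -
			 local->group_util;

	/* later pass, busiest still overloaded: both groups are (or will
	 * become) overloaded, fall back to avg_load based balancing */

	/* later pass, busiest no longer overloaded: both groups have spare
	 * capacity, just even out the number of idle CPUs */
	env->src_grp_type = migrate_task;
	env->imbalance = max_t(long, 0,
			       (local->idle_cpus - busiest->idle_cpus) >> 1);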

>

> We could build an sds->avg_util similar to sds->avg_load.


When there is spare capacity, we balance the number of idle cpus.

>

> > +             }

> > +

> > +             env->imbalance = imbalance;

> > +             return;

> >       }

> [...]

> > +/*

> > + * Decision matrix according to the local and busiest group state

> > + *

> > + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded

> > + * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced

> > + * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced

> > + * misfit_task      force     N/A        N/A    N/A  force      force

> > + * asym_capacity    force     force      N/A    N/A  force      force

> > + * imbalanced       force     force      N/A    N/A  force      force

> > + * overloaded       force     force      N/A    N/A  force      avg_load

> > + *

> > + * N/A :      Not Applicable because already filtered while updating

> > + *            statistics.

> > + * balanced : The system is balanced for these 2 groups.

> > + * force :    Calculate the imbalance as load migration is probably needed.

> > + * avg_load : Only if imbalance is significant enough.

> > + * nr_idle :  dst_cpu is not busy and the number of idle cpus is quite

> > + *            different in groups.

> > + */

> > +

>

> Nice!
Valentin Schneider July 26, 2019, 10:41 a.m. UTC | #14
On 26/07/2019 10:01, Vincent Guittot wrote:
>> Huh, interesting. Why go for utilization?

> 

> Mainly because that's what is used to detect a misfit task and not the load

> 

>>

>> Right now we store the load of the task and use it to pick the "biggest"

>> misfit (in terms of load) when there are more than one misfit tasks to

>> choose:

> 

> But having a big load doesn't mean that you have a big utilization

> 

> so you can trig the misfit case because of task A with a big

> utilization that doesn't fit to its local cpu. But then select a task

> B in detach_tasks that has a small utilization but a big weight and as

> a result a higher load

> And task B will never trig the misfit UC by itself and should not

> steal the pulling opportunity of task A

> 


We can avoid this entirely by going straight for an active balance when
we are balancing misfit tasks (which we really should be doing TBH).

If we *really* want to be surgical about misfit migration, we could track
the task itself via a pointer to its task_struct, but IIRC Morten
purposely avoided this due to all the fun synchronization issues that
come with it.

With that out of the way, I still believe we should maximize the migrated
load when dealing with several misfit tasks - there's not much else you can
look at anyway to make a decision.

It sort of makes sense when e.g. you have two misfit tasks stuck on LITTLE
CPUs and you finally have a big CPU being freed, it would seem fair to pick
the one that's been "throttled" the longest - at equal niceness, that would
be the one with the highest load.

>>

>> update_sd_pick_busiest():

>> ,----

>> | /*

>> |  * If we have more than one misfit sg go with the biggest misfit.

>> |  */

>> | if (sgs->group_type == group_misfit_task &&

>> |     sgs->group_misfit_task_load < busiest->group_misfit_task_load)

>> |       return false;

>> `----

>>

>> I don't think it makes much sense to maximize utilization for misfit tasks:

>> they're over the capacity margin, which exactly means "I can't really tell

>> you much on that utilization other than it doesn't fit".

>>

>> At the very least, this rq field should be renamed "misfit_task_util".

> 

> yes. I agree that i should rename the field

> 

>>

>> [...]

>>

>>> @@ -7060,12 +7048,21 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;

>>>  enum fbq_type { regular, remote, all };

>>>

>>>  enum group_type {

>>> -     group_other = 0,

>>> +     group_has_spare = 0,

>>> +     group_fully_busy,

>>>       group_misfit_task,

>>> +     group_asym_capacity,

>>>       group_imbalanced,

>>>       group_overloaded,

>>>  };

>>>

>>> +enum group_migration {

>>> +     migrate_task = 0,

>>> +     migrate_util,

>>> +     migrate_load,

>>> +     migrate_misfit,

>>

>> Can't we have only 3 imbalance types (task, util, load), and make misfit

>> fall in that first one? Arguably it is a special kind of task balance,

>> since it would go straight for the active balance, but it would fit a

>> `migrate_task` imbalance with a "go straight for active balance" flag

>> somewhere.

> 

> migrate_misfit uses its own special condition to detect the task that

> can be pulled compared to the other ones

> 


Since misfit is about migrating running tasks, a `migrate_task` imbalance
with a flag that goes straight to active balancing should work, no?
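Something like this, perhaps (LBF_ACTIVE_BALANCE is a made-up flag name,
just to sketch the idea):

	if (busiest->group_type == group_misfit_task) {
		/* one running task to move, go straight for active balance */
		env->src_grp_type = migrate_task;
		env->flags |= LBF_ACTIVE_BALANCE;	/* hypothetical flag */
		env->imbalance = 1;
		return;
	}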

[...]
>> Rather than filling the local group, shouldn't we follow the same strategy

>> as for load, IOW try to reach an average without pushing local above nor

>> busiest below ?

> 

> But we don't know if this will be enough to make the busiest group not

> overloaded anymore

> 

> This is a transient state:

> a group is overloaded, another one has spare capacity

> How to balance the system will depend of how much overload if in the

> group and we don't know this value.

> The only solution is to:

> - try to pull as much task as possible to fill the spare capacity

> - Is the group still overloaded ? use avg_load to balance the system

> because both group will be overloaded

> - Is the group no more overloaded ? balance the number of idle cpus

> 

>>

>> We could build an sds->avg_util similar to sds->avg_load.

> 

> When there is spare capacity, we balances the number of idle cpus

> 


What if there is spare capacity but no idle CPUs? In scenarios like this
we should balance utilization. We could wait for a newidle balance to
happen, but it'd be a shame to repeatedly do this when we could
preemptively balance utilization.
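A very rough sketch of what that could look like (sds->avg_util and
sds->total_util don't exist in the patch, they are assumptions that
mirror the avg_load scheme):

	sds->avg_util = (SCHED_CAPACITY_SCALE * sds->total_util) /
			sds->total_capacity;

	/* pull only up to the domain-average utilization for local */
	if (local->group_util * SCHED_CAPACITY_SCALE <
	    sds->avg_util * local->group_capacity) {
		env->src_grp_type = migrate_util;
		env->imbalance = (sds->avg_util * local->group_capacity) /
				 SCHED_CAPACITY_SCALE - local->group_util;
	}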
Vincent Guittot July 26, 2019, 12:30 p.m. UTC | #15
On Fri, 26 Jul 2019 at 12:41, Valentin Schneider
<valentin.schneider@arm.com> wrote:
>

> On 26/07/2019 10:01, Vincent Guittot wrote:

> >> Huh, interesting. Why go for utilization?

> >

> > Mainly because that's what is used to detect a misfit task and not the load

> >

> >>

> >> Right now we store the load of the task and use it to pick the "biggest"

> >> misfit (in terms of load) when there are more than one misfit tasks to

> >> choose:

> >

> > But having a big load doesn't mean that you have a big utilization

> >

> > so you can trig the misfit case because of task A with a big

> > utilization that doesn't fit to its local cpu. But then select a task

> > B in detach_tasks that has a small utilization but a big weight and as

> > a result a higher load

> > And task B will never trig the misfit UC by itself and should not

> > steal the pulling opportunity of task A

> >

>

> We can avoid this entirely by going straight for an active balance when

> we are balancing misfit tasks (which we really should be doing TBH).


but your misfit task might not be the running one anymore when
load_balance actually happens

>

> If we *really* want to be surgical about misfit migration, we could track

> the task itself via a pointer to its task_struct, but IIRC Morten


I thought about this, but the task may have already died by that time
and the pointer would no longer be valid.
Or we would have to parse the list of tasks still attached to the cpu
and compare them with the saved pointer, but then it's not scalable and
will consume a lot of time.

> purposely avoided this due to all the fun synchronization issues that

> come with it.

>

> With that out of the way, I still believe we should maximize the migrated

> load when dealing with several misfit tasks - there's not much else you can

> look at anyway to make a decision.


But you can easily select a task that is not misfit, so what is the
best/worst case: selecting a completely wrong task, or at least one of
the real misfit tasks?

I'm fine with going back to using load instead of util, but it's not
robust IMO.

>

> It sort of makes sense when e.g. you have two misfit tasks stuck on LITTLE

> CPUs and you finally have a big CPU being freed, it would seem fair to pick

> the one that's been "throttled" the longest - at equal niceness, that would

> be the one with the highest load.

>

> >>

> >> update_sd_pick_busiest():

> >> ,----

> >> | /*

> >> |  * If we have more than one misfit sg go with the biggest misfit.

> >> |  */

> >> | if (sgs->group_type == group_misfit_task &&

> >> |     sgs->group_misfit_task_load < busiest->group_misfit_task_load)

> >> |       return false;

> >> `----

> >>

> >> I don't think it makes much sense to maximize utilization for misfit tasks:

> >> they're over the capacity margin, which exactly means "I can't really tell

> >> you much on that utilization other than it doesn't fit".

> >>

> >> At the very least, this rq field should be renamed "misfit_task_util".

> >

> > yes. I agree that i should rename the field

> >

> >>

> >> [...]

> >>

> >>> @@ -7060,12 +7048,21 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;

> >>>  enum fbq_type { regular, remote, all };

> >>>

> >>>  enum group_type {

> >>> -     group_other = 0,

> >>> +     group_has_spare = 0,

> >>> +     group_fully_busy,

> >>>       group_misfit_task,

> >>> +     group_asym_capacity,

> >>>       group_imbalanced,

> >>>       group_overloaded,

> >>>  };

> >>>

> >>> +enum group_migration {

> >>> +     migrate_task = 0,

> >>> +     migrate_util,

> >>> +     migrate_load,

> >>> +     migrate_misfit,

> >>

> >> Can't we have only 3 imbalance types (task, util, load), and make misfit

> >> fall in that first one? Arguably it is a special kind of task balance,

> >> since it would go straight for the active balance, but it would fit a

> >> `migrate_task` imbalance with a "go straight for active balance" flag

> >> somewhere.

> >

> > migrate_misfit uses its own special condition to detect the task that

> > can be pulled compared to the other ones

> >

>

> Since misfit is about migrating running tasks, a `migrate_task` imbalance

> with a flag that goes straight to active balancing should work, no?


see my previous comment

>

> [...]

> >> Rather than filling the local group, shouldn't we follow the same strategy

> >> as for load, IOW try to reach an average without pushing local above nor

> >> busiest below ?

> >

> > But we don't know if this will be enough to make the busiest group not

> > overloaded anymore

> >

> > This is a transient state:

> > a group is overloaded, another one has spare capacity

> > How to balance the system will depend of how much overload if in the

> > group and we don't know this value.

> > The only solution is to:

> > - try to pull as much task as possible to fill the spare capacity

> > - Is the group still overloaded ? use avg_load to balance the system

> > because both group will be overloaded

> > - Is the group no more overloaded ? balance the number of idle cpus

> >

> >>

> >> We could build an sds->avg_util similar to sds->avg_load.

> >

> > When there is spare capacity, we balances the number of idle cpus

> >

>

> What if there is spare capacity but no idle CPUs? In scenarios like this

> we should balance utilization. We could wait for a newidle balance to


Why should we balance anything? All tasks already have enough running time.
It's better to wait for a cpu to become idle instead of trying to
predict which one will be idle first and migrating tasks uselessly,
because other tasks can easily wake up in the meantime.

> happen, but it'd be a shame to repeatedly do this when we could

> preemptively balance utilization.

>
Srikar Dronamraju July 26, 2019, 1:58 p.m. UTC | #16
> 

> The type of sched_group has been extended to better reflect the type of

> imbalance. We now have :

> 	group_has_spare

> 	group_fully_busy

> 	group_misfit_task

> 	group_asym_capacity

> 	group_imbalanced

> 	group_overloaded


How is group_fully_busy different from group_overloaded?

> 

> Based on the type fo sched_group, load_balance now sets what it wants to

> move in order to fix the imnbalance. It can be some load as before but

> also some utilization, a number of task or a type of task:

> 	migrate_task

> 	migrate_util

> 	migrate_load

> 	migrate_misfit


Can we club migrate_util and migrate_misfit?

> @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)

>  		if (!can_migrate_task(p, env))

>  			goto next;

>  

> -		load = task_h_load(p);

> +		if (env->src_grp_type == migrate_load) {

> +			unsigned long load = task_h_load(p);

>  

> -		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

> -			goto next;

> +			if (sched_feat(LB_MIN) &&

> +			    load < 16 && !env->sd->nr_balance_failed)

> +				goto next;

> +

> +			if ((load / 2) > env->imbalance)

> +				goto next;


I know this existed before too, but if the load is exactly or around 2x
env->imbalance, the resulting imbalance after the load balance operation
would still be around env->imbalance. We may lose some cache affinity too.

Can we do something like:
		if (2 * load > 3 * env->imbalance)
			goto next;

> @@ -7690,14 +7711,14 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)

>  	*sds = (struct sd_lb_stats){

>  		.busiest = NULL,

>  		.local = NULL,

> -		.total_running = 0UL,

>  		.total_load = 0UL,

>  		.total_capacity = 0UL,

>  		.busiest_stat = {

>  			.avg_load = 0UL,

>  			.sum_nr_running = 0,

>  			.sum_h_nr_running = 0,

> -			.group_type = group_other,

> +			.idle_cpus = UINT_MAX,


Why are we initializing idle_cpus to UINT_MAX? Shouldn't this be set to 0?
I only see an increment and a compare.

> +			.group_type = group_has_spare,

>  		},

>  	};

>  }

>  

>  static inline enum

> -group_type group_classify(struct sched_group *group,

> +group_type group_classify(struct lb_env *env,

> +			  struct sched_group *group,

>  			  struct sg_lb_stats *sgs)

>  {

> -	if (sgs->group_no_capacity)

> +	if (group_is_overloaded(env, sgs))

>  		return group_overloaded;

>  

>  	if (sg_imbalanced(group))

> @@ -7953,7 +7975,13 @@ group_type group_classify(struct sched_group *group,

>  	if (sgs->group_misfit_task_load)

>  		return group_misfit_task;

>  

> -	return group_other;

> +	if (sgs->group_asym_capacity)

> +		return group_asym_capacity;

> +

> +	if (group_has_capacity(env, sgs))

> +		return group_has_spare;

> +

> +	return group_fully_busy;


If it's not overloaded but also doesn't have capacity,
does that mean its capacity is completely filled?
Can't we consider that the same as overloaded?

>  }

>  

>  

> -	if (sgs->sum_h_nr_running)

> -		sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running;

> +	sgs->group_capacity = group->sgc->capacity;

>  

>  	sgs->group_weight = group->group_weight;

>  

> -	sgs->group_no_capacity = group_is_overloaded(env, sgs);

> -	sgs->group_type = group_classify(group, sgs);

> +	sgs->group_type = group_classify(env, group, sgs);

> +

> +	/* Computing avg_load makes sense only when group is overloaded */

> +	if (sgs->group_type != group_overloaded)

> +		sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /

> +				sgs->group_capacity;


Mismatch in comment and code?

> @@ -8079,11 +8120,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,

>  	if (sgs->group_type < busiest->group_type)

>  		return false;

>  

> -	if (sgs->avg_load <= busiest->avg_load)

> +	/* Select the overloaded group with highest avg_load */

> +	if (sgs->group_type == group_overloaded &&

> +	    sgs->avg_load <= busiest->avg_load)

> +		return false;

> +

> +	/* Prefer to move from lowest priority CPU's work */

> +	if (sgs->group_type == group_asym_capacity && sds->busiest &&

> +	    sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))

>  		return false;


I thought this should have been 
	/* Prefer to move from lowest priority CPU's work */
	if (sgs->group_type == group_asym_capacity && sds->busiest &&
	    sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
		return true;

> @@ -8357,72 +8318,115 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s

>  	if (busiest->group_type == group_imbalanced) {

>  		/*

>  		 * In the group_imb case we cannot rely on group-wide averages

> -		 * to ensure CPU-load equilibrium, look at wider averages. XXX

> +		 * to ensure CPU-load equilibrium, try to move any task to fix

> +		 * the imbalance. The next load balance will take care of

> +		 * balancing back the system.

>  		 */

> -		busiest->load_per_task =

> -			min(busiest->load_per_task, sds->avg_load);

> +		env->src_grp_type = migrate_task;

> +		env->imbalance = 1;

> +		return;

>  	}

>  

> -	/*

> -	 * Avg load of busiest sg can be less and avg load of local sg can

> -	 * be greater than avg load across all sgs of sd because avg load

> -	 * factors in sg capacity and sgs with smaller group_type are

> -	 * skipped when updating the busiest sg:

> -	 */

> -	if (busiest->group_type != group_misfit_task &&

> -	    (busiest->avg_load <= sds->avg_load ||

> -	     local->avg_load >= sds->avg_load)) {

> -		env->imbalance = 0;

> -		return fix_small_imbalance(env, sds);

> +	if (busiest->group_type == group_misfit_task) {

> +		/* Set imbalance to allow misfit task to be balanced. */

> +		env->src_grp_type = migrate_misfit;

> +		env->imbalance = busiest->group_misfit_task_load;

> +		return;

>  	}

>  

>  	/*

> -	 * If there aren't any idle CPUs, avoid creating some.

> +	 * Try to use spare capacity of local group without overloading it or

> +	 * emptying busiest

>  	 */

> -	if (busiest->group_type == group_overloaded &&

> -	    local->group_type   == group_overloaded) {

> -		load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE;

> -		if (load_above_capacity > busiest->group_capacity) {

> -			load_above_capacity -= busiest->group_capacity;

> -			load_above_capacity *= scale_load_down(NICE_0_LOAD);

> -			load_above_capacity /= busiest->group_capacity;

> -		} else

> -			load_above_capacity = ~0UL;

> +	if (local->group_type == group_has_spare) {

> +		long imbalance;

> +

> +		/*

> +		 * If there is no overload, we just want to even the number of

> +		 * idle cpus.

> +		 */

> +		env->src_grp_type = migrate_task;

> +		imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);


Shouldn't this be:
		imbalance = max_t(long, 0, (busiest->idle_cpus - local->idle_cpus) >> 1);
> +

> +		if (sds->prefer_sibling)

> +			/*

> +			 * When prefer sibling, evenly spread running tasks on

> +			 * groups.

> +			 */

> +			imbalance = (busiest->sum_nr_running - local->sum_nr_running) >> 1;

> +

> +		if (busiest->group_type > group_fully_busy) {

> +			/*

> +			 * If busiest is overloaded, try to fill spare

> +			 * capacity. This might end up creating spare capacity

> +			 * in busiest or busiest still being overloaded but

> +			 * there is no simple way to directly compute the

> +			 * amount of load to migrate in order to balance the

> +			 * system.

> +			 */

> +			env->src_grp_type = migrate_util;

> +			imbalance = max(local->group_capacity, local->group_util) -

> +				    local->group_util;

> +		}

> +

> +		env->imbalance = imbalance;

> +		return;

>  	}

>  

>  	/*

> -	 * We're trying to get all the CPUs to the average_load, so we don't

> -	 * want to push ourselves above the average load, nor do we wish to

> -	 * reduce the max loaded CPU below the average load. At the same time,

> -	 * we also don't want to reduce the group load below the group

> -	 * capacity. Thus we look for the minimum possible imbalance.

> +	 * Local is fully busy but have to take more load to relieve the

> +	 * busiest group

>  	 */

> -	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);

> +	if (local->group_type < group_overloaded) {



What action are we taking if we find local->group_type to be group_imbalanced
or group_misfit? We will fall through to here, but I don't know if it's right
to look at avg_load in that case.

> +		/*

> +		 * Local will become overvloaded so the avg_load metrics are

> +		 * finally needed

> +		 */

>  

> -	/* How much load to actually move to equalise the imbalance */

> -	env->imbalance = min(

> -		max_pull * busiest->group_capacity,

> -		(sds->avg_load - local->avg_load) * local->group_capacity

> -	) / SCHED_CAPACITY_SCALE;

> +		local->avg_load = (local->group_load*SCHED_CAPACITY_SCALE)

> +						/ local->group_capacity;

>  

> -	/* Boost imbalance to allow misfit task to be balanced. */

> -	if (busiest->group_type == group_misfit_task) {

> -		env->imbalance = max_t(long, env->imbalance,

> -				       busiest->group_misfit_task_load);

> +		sds->avg_load = (SCHED_CAPACITY_SCALE * sds->total_load)

> +						/ sds->total_capacity;

>  	}

>  

>  	/*

> -	 * if *imbalance is less than the average load per runnable task

> -	 * there is no guarantee that any tasks will be moved so we'll have

> -	 * a think about bumping its value to force at least one task to be

> -	 * moved

> +	 * Both group are or will become overloaded and we're trying to get

> +	 * all the CPUs to the average_load, so we don't want to push

> +	 * ourselves above the average load, nor do we wish to reduce the

> +	 * max loaded CPU below the average load. At the same time, we also

> +	 * don't want to reduce the group load below the group capacity.

> +	 * Thus we look for the minimum possible imbalance.

>  	 */

> -	if (env->imbalance < busiest->load_per_task)

> -		return fix_small_imbalance(env, sds);

> +	env->src_grp_type = migrate_load;

> +	env->imbalance = min(

> +		(busiest->avg_load - sds->avg_load) * busiest->group_capacity,

> +		(sds->avg_load - local->avg_load) * local->group_capacity

> +	) / SCHED_CAPACITY_SCALE;

>  }


We calculate avg_load for the !group_overloaded case, but seem to be
using it for group_overloaded cases too.


-- 
Thanks and Regards
Srikar Dronamraju
Valentin Schneider July 26, 2019, 2:01 p.m. UTC | #17
On 26/07/2019 13:30, Vincent Guittot wrote:
>> We can avoid this entirely by going straight for an active balance when

>> we are balancing misfit tasks (which we really should be doing TBH).

> 

> but your misfit task might not be the running one anymore when

> load_balance effectively happens

> 


We could add a check in the active balance bits to make sure the current
task is still a misfit task (albeit not necessarily the one we wanted to
migrate, since we can't really differentiate them).

Misfit migration shouldn't go through detach_tasks() - if the misfit task
is still the running task, we want to go for active balance anyway, and if
it's not the running task anymore then we should try to detect it and give
up - there's not much else we can do. From a rq's perspective, a task can
only ever be misfit if it's currently running.

The current code can totally active balance the wrong task if the load
balancer saw a misfit task in update_sd_lb_stats() but it moved away in the
meantime, so making misfit balancing skip detach_tasks() would be a straight
improvement IMO: we can still get some active balance collaterals, but at
least we don't wrongfully detach a non-running task that happened to have
the right load shape.
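Sketch of the kind of check I mean (the helper name is made up):

	/* in the active balance path, e.g. need_active_balance() */
	if (env->src_grp_type == migrate_misfit &&
	    !rq_has_misfit_task(env->src_rq))	/* hypothetical helper */
		/* the flagged task moved away, give up this round */
		return 0;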

>>

>> If we *really* want to be surgical about misfit migration, we could track

>> the task itself via a pointer to its task_struct, but IIRC Morten

> 

> I thought about this but task can have already die at that time and

> the pointer is no more relevant.

> Or we should parse the list of task still attached to the cpu and

> compare them with the saved pointer but then it's not scalable and

> will consume a lot of time

> 

>> purposely avoided this due to all the fun synchronization issues that

>> come with it.

>>

>> With that out of the way, I still believe we should maximize the migrated

>> load when dealing with several misfit tasks - there's not much else you can

>> look at anyway to make a decision.

> 

> But you can easily select a task that is not misfit so what is the best/worst ?

> select a fully wrong task or at least one of the real misfit tasks

> 


Utilization can't help you select a "best" misfit task amongst several 
since the utilization of misfit tasks is by definition meaningless.

I do agree that looking at utilization when detaching the task prevents
picking a non-misfit task, but those are two different issues:

1) Among several rqs/groups with misfit tasks, pick the busiest one
   (this is where I'm arguing we should use load)
2) When detaching a task, make sure it's a misfit task (this is where
   you're arguing we should use utilization).

> I'm fine to go back and use load instead of util but it's not robust IMO.

> 


[...]
>> What if there is spare capacity but no idle CPUs? In scenarios like this

>> we should balance utilization. We could wait for a newidle balance to

> 

> why should we balance anything ? all tasks have already enough running time.

> It's better to wait for a cpu to become idle instead of trying to

> predict which one will be idle first and migrate task uselessly

> because other tasks can easily wakeup in the meantime

> 


I probably need to play with this and create some synthetic use cases.

What I had in mind is something like 2 CPUs, CPU0 running a 20% task and
CPU1 running 6 10% tasks.

If CPU0 runs the load balancer, balancing utilization would mean pulling
2 tasks from CPU1 to reach the domain-average of 40%. The good side of this
is that we could save ourselves from running some newidle balances, but
I'll admit that's all quite "finger in the air".

>> happen, but it'd be a shame to repeatedly do this when we could

>> preemptively balance utilization.

>>
Valentin Schneider July 26, 2019, 2:09 p.m. UTC | #18
On 26/07/2019 14:58, Srikar Dronamraju wrote:
[...]
>> @@ -8357,72 +8318,115 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s

>>  	if (busiest->group_type == group_imbalanced) {

>>  		/*

>>  		 * In the group_imb case we cannot rely on group-wide averages

>> -		 * to ensure CPU-load equilibrium, look at wider averages. XXX

>> +		 * to ensure CPU-load equilibrium, try to move any task to fix

>> +		 * the imbalance. The next load balance will take care of

>> +		 * balancing back the system.

>>  		 */

>> -		busiest->load_per_task =

>> -			min(busiest->load_per_task, sds->avg_load);

>> +		env->src_grp_type = migrate_task;

>> +		env->imbalance = 1;

>> +		return;

>>  	}

>>  

>> -	/*

>> -	 * Avg load of busiest sg can be less and avg load of local sg can

>> -	 * be greater than avg load across all sgs of sd because avg load

>> -	 * factors in sg capacity and sgs with smaller group_type are

>> -	 * skipped when updating the busiest sg:

>> -	 */

>> -	if (busiest->group_type != group_misfit_task &&

>> -	    (busiest->avg_load <= sds->avg_load ||

>> -	     local->avg_load >= sds->avg_load)) {

>> -		env->imbalance = 0;

>> -		return fix_small_imbalance(env, sds);

>> +	if (busiest->group_type == group_misfit_task) {

>> +		/* Set imbalance to allow misfit task to be balanced. */

>> +		env->src_grp_type = migrate_misfit;

>> +		env->imbalance = busiest->group_misfit_task_load;

>> +		return;

>>  	}

>>  

>>  	/*

>> -	 * If there aren't any idle CPUs, avoid creating some.

>> +	 * Try to use spare capacity of local group without overloading it or

>> +	 * emptying busiest

>>  	 */

>> -	if (busiest->group_type == group_overloaded &&

>> -	    local->group_type   == group_overloaded) {

>> -		load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE;

>> -		if (load_above_capacity > busiest->group_capacity) {

>> -			load_above_capacity -= busiest->group_capacity;

>> -			load_above_capacity *= scale_load_down(NICE_0_LOAD);

>> -			load_above_capacity /= busiest->group_capacity;

>> -		} else

>> -			load_above_capacity = ~0UL;

>> +	if (local->group_type == group_has_spare) {

>> +		long imbalance;

>> +

>> +		/*

>> +		 * If there is no overload, we just want to even the number of

>> +		 * idle cpus.

>> +		 */

>> +		env->src_grp_type = migrate_task;

>> +		imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);

> 

> Shouldnt this be?

> 		imbalance = max_t(long, 0, (busiest->idle_cpus - local->idle_cpus) >> 1);


I think it's the right way around - if busiest has more idle CPUs than local,
then we shouldn't balance (local is busier than busiest)

However, doesn't that lead to an imbalance of 0 when e.g. local has 1 idle
CPU and busiest has 0? If busiest has more than one task we should try
to pull at least one.
Vincent Guittot July 26, 2019, 2:42 p.m. UTC | #19
On Fri, 26 Jul 2019 at 15:59, Srikar Dronamraju
<srikar@linux.vnet.ibm.com> wrote:
>

> >

> > The type of sched_group has been extended to better reflect the type of

> > imbalance. We now have :

> >       group_has_spare

> >       group_fully_busy

> >       group_misfit_task

> >       group_asym_capacity

> >       group_imbalanced

> >       group_overloaded

>

> How is group_fully_busy different from group_overloaded?


group_fully_busy means that tasks have enough compute capacity, whereas
group_overloaded means that tasks are competing to use the CPU and
need more compute capacity.
As an example:
a cpu is fully busy with 1 always-running task,
but a cpu is overloaded with 2 always-running tasks, or with 2 tasks
that each want 75% of the CPU.
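In other words (hypothetical numbers, on a cpu of capacity 1024):

	/*
	 * group_has_spare:  1 task wanting 40% -> util ~ 410, room for more
	 * group_fully_busy: 1 always-running task -> the cpu is fully used
	 *                   but the task gets all the time it wants
	 * group_overloaded: 2 always-running tasks, or 2 tasks wanting 75%
	 *                   each -> tasks compete and get less than they want
	 */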

>

> >

> > Based on the type fo sched_group, load_balance now sets what it wants to

> > move in order to fix the imnbalance. It can be some load as before but

> > also some utilization, a number of task or a type of task:

> >       migrate_task

> >       migrate_util

> >       migrate_load

> >       migrate_misfit

>

> Can we club migrate_util and migrate_misfit?


migrate_misfit wants to move 1 task, whereas migrate_util wants to
migrate an amount of utilization, which can lead to migrating several
tasks.
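The difference is visible in the detach_tasks() hunk quoted earlier,
condensed here:

	unsigned long util = task_util_est(p);

	if (env->src_grp_type == migrate_util) {
		/* keep pulling tasks until their summed utilization
		 * covers the requested amount */
		if (util > env->imbalance)
			goto next;
		env->imbalance -= util;
	} else if (env->src_grp_type == migrate_misfit) {
		/* take only a task whose utilization matches the recorded
		 * misfit load, then stop */
		if (2*util < env->imbalance)
			goto next;
		env->imbalance = 0;
	}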

>

> > @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)

> >               if (!can_migrate_task(p, env))

> >                       goto next;

> >

> > -             load = task_h_load(p);

> > +             if (env->src_grp_type == migrate_load) {

> > +                     unsigned long load = task_h_load(p);

> >

> > -             if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

> > -                     goto next;

> > +                     if (sched_feat(LB_MIN) &&

> > +                         load < 16 && !env->sd->nr_balance_failed)

> > +                             goto next;

> > +

> > +                     if ((load / 2) > env->imbalance)

> > +                             goto next;

>

> I know this existed before too but if the load is exactly or around 2x of

> env->imbalance, the resultant imbalance after the load balance operation

> would still be around env->imbalance. We may lose some cache affinity too.

>

> Can we do something like.

>                 if (2 * load > 3 * env->imbalance)

>                         goto next;


TBH, I don't know what the best value should be, and it's probably
worth doing some investigation, but I would prefer to do that as a
separate patch to get a similar behavior in the overloaded case.
Why do you propose 3/2 instead of 2?

>

> > @@ -7690,14 +7711,14 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)

> >       *sds = (struct sd_lb_stats){

> >               .busiest = NULL,

> >               .local = NULL,

> > -             .total_running = 0UL,

> >               .total_load = 0UL,

> >               .total_capacity = 0UL,

> >               .busiest_stat = {

> >                       .avg_load = 0UL,

> >                       .sum_nr_running = 0,

> >                       .sum_h_nr_running = 0,

> > -                     .group_type = group_other,

> > +                     .idle_cpus = UINT_MAX,

>

> Why are we initializing idle_cpus to UINT_MAX? Shouldnt this be set to 0?


These are the default busiest statistics attached to env.

> I only see an increment and compare.


In update_sd_pick_busiest(), among the groups with spare capacity
(group_has_spare) we look for the one with the lowest number of idle
cpus, which we expect to be the busiest.
So the 1st group with spare capacity will for sure have fewer idle_cpus
than the default and will replace it.
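i.e. something along these lines in update_sd_pick_busiest() (a sketch,
the exact condition in the patch may differ):

	if (sgs->group_type == group_has_spare &&
	    sgs->idle_cpus > busiest->idle_cpus)
		return false;	/* candidate is less busy, keep current */

	/*
	 * busiest_stat starts with idle_cpus == UINT_MAX, so the first
	 * group with spare capacity is never rejected by this test and
	 * replaces the default.
	 */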

>

> > +                     .group_type = group_has_spare,

> >               },

> >       };

> >  }

> >

> >  static inline enum

> > -group_type group_classify(struct sched_group *group,

> > +group_type group_classify(struct lb_env *env,

> > +                       struct sched_group *group,

> >                         struct sg_lb_stats *sgs)

> >  {

> > -     if (sgs->group_no_capacity)

> > +     if (group_is_overloaded(env, sgs))

> >               return group_overloaded;

> >

> >       if (sg_imbalanced(group))

> > @@ -7953,7 +7975,13 @@ group_type group_classify(struct sched_group *group,

> >       if (sgs->group_misfit_task_load)

> >               return group_misfit_task;

> >

> > -     return group_other;

> > +     if (sgs->group_asym_capacity)

> > +             return group_asym_capacity;

> > +

> > +     if (group_has_capacity(env, sgs))

> > +             return group_has_spare;

> > +

> > +     return group_fully_busy;

>

> If its not overloaded but also doesnt have capacity.

> Does it mean its capacity is completely filled.

> Cant we consider that as same as overloaded?


I answered this in the 1st question.

>

> >  }

> >

> >

> > -     if (sgs->sum_h_nr_running)

> > -             sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running;

> > +     sgs->group_capacity = group->sgc->capacity;

> >

> >       sgs->group_weight = group->group_weight;

> >

> > -     sgs->group_no_capacity = group_is_overloaded(env, sgs);

> > -     sgs->group_type = group_classify(group, sgs);

> > +     sgs->group_type = group_classify(env, group, sgs);

> > +

> > +     /* Computing avg_load makes sense only when group is overloaded */

> > +     if (sgs->group_type != group_overloaded)

> > +             sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /

> > +                             sgs->group_capacity;

>

> Mismatch in comment and code?


I may need to add more comments, but at this step the group should be
either overloaded or fully busy, although it can also be imbalanced.
In case of a group that is fully busy or imbalanced (sgs->group_type !=
group_overloaded), we haven't computed avg_load yet, so we have to do
so because:
- In the case of fully_busy, we are going to become overloaded, which is
the next step after fully busy when you are about to pull more load
- In case of imbalance, we don't know the real state of the local group
so we fall back to this default behavior

>

> > @@ -8079,11 +8120,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,

> >       if (sgs->group_type < busiest->group_type)

> >               return false;

> >

> > -     if (sgs->avg_load <= busiest->avg_load)

> > +     /* Select the overloaded group with highest avg_load */

> > +     if (sgs->group_type == group_overloaded &&

> > +         sgs->avg_load <= busiest->avg_load)

> > +             return false;

> > +

> > +     /* Prefer to move from lowest priority CPU's work */

> > +     if (sgs->group_type == group_asym_capacity && sds->busiest &&

> > +         sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))

> >               return false;

>

> I thought this should have been

>         /* Prefer to move from lowest priority CPU's work */

>         if (sgs->group_type == group_asym_capacity && sds->busiest &&

>             sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))

>                 return true;


Here we want to select the "busiest" group_asym_capacity, which means
the one with the lowest priority.
If sg->asym_prefer_cpu is preferred over
sds->busiest->asym_prefer_cpu, we should keep busiest as the group to
be emptied and return false so as not to replace the latter.

>

> > @@ -8357,72 +8318,115 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s

> >       if (busiest->group_type == group_imbalanced) {

> >               /*

> >                * In the group_imb case we cannot rely on group-wide averages

> > -              * to ensure CPU-load equilibrium, look at wider averages. XXX

> > +              * to ensure CPU-load equilibrium, try to move any task to fix

> > +              * the imbalance. The next load balance will take care of

> > +              * balancing back the system.

> >                */

> > -             busiest->load_per_task =

> > -                     min(busiest->load_per_task, sds->avg_load);

> > +             env->src_grp_type = migrate_task;

> > +             env->imbalance = 1;

> > +             return;

> >       }

> >

> > -     /*

> > -      * Avg load of busiest sg can be less and avg load of local sg can

> > -      * be greater than avg load across all sgs of sd because avg load

> > -      * factors in sg capacity and sgs with smaller group_type are

> > -      * skipped when updating the busiest sg:

> > -      */

> > -     if (busiest->group_type != group_misfit_task &&

> > -         (busiest->avg_load <= sds->avg_load ||

> > -          local->avg_load >= sds->avg_load)) {

> > -             env->imbalance = 0;

> > -             return fix_small_imbalance(env, sds);

> > +     if (busiest->group_type == group_misfit_task) {

> > +             /* Set imbalance to allow misfit task to be balanced. */

> > +             env->src_grp_type = migrate_misfit;

> > +             env->imbalance = busiest->group_misfit_task_load;

> > +             return;

> >       }

> >

> >       /*

> > -      * If there aren't any idle CPUs, avoid creating some.

> > +      * Try to use spare capacity of local group without overloading it or

> > +      * emptying busiest

> >        */

> > -     if (busiest->group_type == group_overloaded &&

> > -         local->group_type   == group_overloaded) {

> > -             load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE;

> > -             if (load_above_capacity > busiest->group_capacity) {

> > -                     load_above_capacity -= busiest->group_capacity;

> > -                     load_above_capacity *= scale_load_down(NICE_0_LOAD);

> > -                     load_above_capacity /= busiest->group_capacity;

> > -             } else

> > -                     load_above_capacity = ~0UL;

> > +     if (local->group_type == group_has_spare) {

> > +             long imbalance;

> > +

> > +             /*

> > +              * If there is no overload, we just want to even the number of

> > +              * idle cpus.

> > +              */

> > +             env->src_grp_type = migrate_task;

> > +             imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);

>

> Shouldnt this be?

>                 imbalance = max_t(long, 0, (busiest->idle_cpus - local->idle_cpus) >> 1);


Local has more idle_cpus than busiest, otherwise we don't try to pull tasks.

> > +

> > +             if (sds->prefer_sibling)

> > +                     /*

> > +                      * When prefer sibling, evenly spread running tasks on

> > +                      * groups.

> > +                      */

> > +                     imbalance = (busiest->sum_nr_running - local->sum_nr_running) >> 1;

> > +

> > +             if (busiest->group_type > group_fully_busy) {

> > +                     /*

> > +                      * If busiest is overloaded, try to fill spare

> > +                      * capacity. This might end up creating spare capacity

> > +                      * in busiest or busiest still being overloaded but

> > +                      * there is no simple way to directly compute the

> > +                      * amount of load to migrate in order to balance the

> > +                      * system.

> > +                      */

> > +                     env->src_grp_type = migrate_util;

> > +                     imbalance = max(local->group_capacity, local->group_util) -

> > +                                 local->group_util;

> > +             }

> > +

> > +             env->imbalance = imbalance;

> > +             return;

> >       }

> >

> >       /*

> > -      * We're trying to get all the CPUs to the average_load, so we don't

> > -      * want to push ourselves above the average load, nor do we wish to

> > -      * reduce the max loaded CPU below the average load. At the same time,

> > -      * we also don't want to reduce the group load below the group

> > -      * capacity. Thus we look for the minimum possible imbalance.

> > +      * Local is fully busy but have to take more load to relieve the

> > +      * busiest group

> >        */

> > -     max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);

> > +     if (local->group_type < group_overloaded) {

>

>

> What action are we taking if we find the local->group_type to be group_imbalanced

> or group_misfit ?  We will fall here but I dont know if it right to look for

> avg_load in that case.


local->group_type can't be misfit.
For the case where local->group_type is imbalanced, I answered in a
previous comment.

>

> > +             /*

> > +              * Local will become overvloaded so the avg_load metrics are

> > +              * finally needed

> > +              */

> >

> > -     /* How much load to actually move to equalise the imbalance */

> > -     env->imbalance = min(

> > -             max_pull * busiest->group_capacity,

> > -             (sds->avg_load - local->avg_load) * local->group_capacity

> > -     ) / SCHED_CAPACITY_SCALE;

> > +             local->avg_load = (local->group_load*SCHED_CAPACITY_SCALE)

> > +                                             / local->group_capacity;

> >

> > -     /* Boost imbalance to allow misfit task to be balanced. */

> > -     if (busiest->group_type == group_misfit_task) {

> > -             env->imbalance = max_t(long, env->imbalance,

> > -                                    busiest->group_misfit_task_load);

> > +             sds->avg_load = (SCHED_CAPACITY_SCALE * sds->total_load)

> > +                                             / sds->total_capacity;

> >       }

> >

> >       /*

> > -      * if *imbalance is less than the average load per runnable task

> > -      * there is no guarantee that any tasks will be moved so we'll have

> > -      * a think about bumping its value to force at least one task to be

> > -      * moved

> > +      * Both group are or will become overloaded and we're trying to get

> > +      * all the CPUs to the average_load, so we don't want to push

> > +      * ourselves above the average load, nor do we wish to reduce the

> > +      * max loaded CPU below the average load. At the same time, we also

> > +      * don't want to reduce the group load below the group capacity.

> > +      * Thus we look for the minimum possible imbalance.

> >        */

> > -     if (env->imbalance < busiest->load_per_task)

> > -             return fix_small_imbalance(env, sds);

> > +     env->src_grp_type = migrate_load;

> > +     env->imbalance = min(

> > +             (busiest->avg_load - sds->avg_load) * busiest->group_capacity,

> > +             (sds->avg_load - local->avg_load) * local->group_capacity

> > +     ) / SCHED_CAPACITY_SCALE;

> >  }

>

> We calculated avg_load for !group_overloaded case, but seem to be using for

> group_overloaded cases too.


For the group_overloaded case, we already computed it in update_sg_lb_stats().

>

>

> --

> Thanks and Regards

> Srikar Dronamraju

>
Vincent Guittot July 26, 2019, 2:47 p.m. UTC | #20
On Fri, 26 Jul 2019 at 16:01, Valentin Schneider
<valentin.schneider@arm.com> wrote:
>

> On 26/07/2019 13:30, Vincent Guittot wrote:

> >> We can avoid this entirely by going straight for an active balance when

> >> we are balancing misfit tasks (which we really should be doing TBH).

> >

> > but your misfit task might not be the running one anymore when

> > load_balance effectively happens

> >

>

> We could add a check in the active balance bits to make sure the current

> task is still a misfit task (albeit not necessarily the one we wanted to

> migrate, since we can't really differentiate them).

>

> Misfit migration shouldn't go through detach_tasks() - if the misfit task

> is still the running task, we want to go for active balance anyway, and if

> it's not the running task anymore then we should try to detect it and give

> up - there's not much else we can do. From a rq's perspective, a task can

> only ever be misfit if it's currently running.

>

> The current code can totally active balance the wrong task if the load

> balancer saw a misfit task in update_sd_lb_stats() but it moved away in the

> meantime, so making misfit balancing skip detach_tasks() would be a straight

> improvement IMO: we can still get some active balance collaterals, but at

> least we don't wrongfully detach a non-running task that happened to have

> the right load shape.

>

> >>

> >> If we *really* want to be surgical about misfit migration, we could track

> >> the task itself via a pointer to its task_struct, but IIRC Morten

> >

> > I thought about this but task can have already die at that time and

> > the pointer is no more relevant.

> > Or we should parse the list of task still attached to the cpu and

> > compare them with the saved pointer but then it's not scalable and

> > will consume a lot of time

> >

> >> purposely avoided this due to all the fun synchronization issues that

> >> come with it.

> >>

> >> With that out of the way, I still believe we should maximize the migrated

> >> load when dealing with several misfit tasks - there's not much else you can

> >> look at anyway to make a decision.

> >

> > But you can easily select a task that is not misfit so what is the best/worst ?

> > select a fully wrong task or at least one of the real misfit tasks

> >

>

> Utilization can't help you select a "best" misfit task amongst several

> since the utilization of misfit tasks is by definition meaningless.

>

> I do agree that looking at utilization when detaching the task prevents

> picking a non-misfit task, but those are two different issues:

>

> 1) Among several rqs/groups with misfit tasks, pick the busiest one

>    (this is where I'm arguing we should use load)

> 2) When detaching a task, make sure it's a misfit task (this is where

>    you're arguing we should use utilization).

>

> > I'm fine to go back and use load instead of util but it's not robust IMO.

> >

>

> [...]

> >> What if there is spare capacity but no idle CPUs? In scenarios like this

> >> we should balance utilization. We could wait for a newidle balance to

> >

> > why should we balance anything ? all tasks have already enough running time.

> > It's better to wait for a cpu to become idle instead of trying to

> > predict which one will be idle first and migrate task uselessly

> > because other tasks can easily wakeup in the meantime

> >

>

> I probably need to play with this and create some synthetic use cases.

>

> What I had in mind is something like 2 CPUs, CPU0 running a 20% task and

> CPU1 running 6 10% tasks.

>

> If CPU0 runs the load balancer, balancing utilization would mean pulling

> 2 tasks from CPU1 to reach the domain-average of 40%. The good side of this

> is that we could save ourselves from running some newidle balances, but

> I'll admit that's all quite "finger in the air".


Don't forget that the scheduler also selects a cpu when a task wakes up
and should cope with such a situation.

>

> >> happen, but it'd be a shame to repeatedly do this when we could

> >> preemptively balance utilization.

> >>
Valentin Schneider July 29, 2019, 2:28 p.m. UTC | #21
On 26/07/2019 15:47, Vincent Guittot wrote:
[...]
>> If CPU0 runs the load balancer, balancing utilization would mean pulling

>> 2 tasks from CPU1 to reach the domain-average of 40%. The good side of this

>> is that we could save ourselves from running some newidle balances, but

>> I'll admit that's all quite "finger in the air".

> 

> Don't forget that scheduler also selects a cpu when task wakeup and

> should cope with such situation

> 


Right, although I'm not sure even the slow wakeup path is any good at
balancing utilization.

>>

>>>> happen, but it'd be a shame to repeatedly do this when we could

>>>> preemptively balance utilization.

>>>>
Srikar Dronamraju July 31, 2019, 1:43 p.m. UTC | #22
* Vincent Guittot <vincent.guittot@linaro.org> [2019-07-26 16:42:53]:

> On Fri, 26 Jul 2019 at 15:59, Srikar Dronamraju

> <srikar@linux.vnet.ibm.com> wrote:

> > > @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)

> > >               if (!can_migrate_task(p, env))

> > >                       goto next;

> > >

> > > -             load = task_h_load(p);

> > > +             if (env->src_grp_type == migrate_load) {

> > > +                     unsigned long load = task_h_load(p);

> > >

> > > -             if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

> > > -                     goto next;

> > > +                     if (sched_feat(LB_MIN) &&

> > > +                         load < 16 && !env->sd->nr_balance_failed)

> > > +                             goto next;

> > > +

> > > +                     if ((load / 2) > env->imbalance)

> > > +                             goto next;

> >

> > I know this existed before too but if the load is exactly or around 2x of

> > env->imbalance, the resultant imbalance after the load balance operation

> > would still be around env->imbalance. We may lose some cache affinity too.

> >

> > Can we do something like.

> >                 if (2 * load > 3 * env->imbalance)

> >                         goto next;

> 

> TBH, I don't know what should be the best value and it's probably

> worth doing some investigation but i would prefer to do that as a

> separate patch to get a similar behavior in the overloaded case

> Why do you propose 3/2 instead of 2 ?

> 


If the imbalance is exactly or around load/2, then we still select the task
to migrate. However, after the migration the imbalance will still be around
load/2.
- Can this lead to ping-pong?
- Do we lose cache affinity even though we didn't gain on the imbalance?
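With made-up numbers to make that concrete:

	/*
	 * env->imbalance = 100, task load = 190
	 *
	 * current check:   load / 2 = 95, not > 100  -> task is migrated,
	 *                  leaving |100 - 190| = 90 of imbalance on the
	 *                  other side, i.e. almost unchanged
	 *
	 * 2*load > 3*imb:  380 > 300                 -> task is skipped,
	 *                  so only tasks whose load is at most 1.5x the
	 *                  imbalance get migrated
	 */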

> > >

> > >

> > > -     if (sgs->sum_h_nr_running)

> > > -             sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running;

> > > +     sgs->group_capacity = group->sgc->capacity;

> > >

> > >       sgs->group_weight = group->group_weight;

> > >

> > > -     sgs->group_no_capacity = group_is_overloaded(env, sgs);

> > > -     sgs->group_type = group_classify(group, sgs);

> > > +     sgs->group_type = group_classify(env, group, sgs);

> > > +

> > > +     /* Computing avg_load makes sense only when group is overloaded */

> > > +     if (sgs->group_type != group_overloaded)

> > > +             sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /

> > > +                             sgs->group_capacity;

> >

> > Mismatch in comment and code?

> 

> I may need to add more comments but at this step, the group should be

> either overloaded or fully busy but it can also be imbalanced.

> In case of a group fully busy or imbalanced (sgs->group_type !=

> group_overloaded), we haven't computed avg_load yet so we have to do

> so because:

> -In the case of fully_busy, we are going to be overloaded which the

> next step after fully busy when you are about to pull more load

> -In case of imbalance, we don't know the real state of the local group

> so we fall back to this default behavior

> 


We seem to be checking for avg_load when the group_type is group_overloaded.
But somehow I don't see where sgs->avg_load is calculated for the
group_overloaded case.

> >

> > We calculated avg_load for !group_overloaded case, but seem to be using for

> > group_overloaded cases too.

> 

> for group_overloaded case, we already computed it in update_sg_lb_stats()

> 



From update_sg_lb_stats()

	if (sgs->group_type != group_overloaded)
		sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
				sgs->group_capacity;

So we seem to be skipping calculation of avg_load for group_overloaded. No?


-- 
Thanks and Regards
Srikar Dronamraju
Vincent Guittot July 31, 2019, 3:37 p.m. UTC | #23
On Wed, 31 Jul 2019 at 15:44, Srikar Dronamraju
<srikar@linux.vnet.ibm.com> wrote:
>

> * Vincent Guittot <vincent.guittot@linaro.org> [2019-07-26 16:42:53]:

>

> > On Fri, 26 Jul 2019 at 15:59, Srikar Dronamraju

> > <srikar@linux.vnet.ibm.com> wrote:

> > > > @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)

> > > >               if (!can_migrate_task(p, env))

> > > >                       goto next;

> > > >

> > > > -             load = task_h_load(p);

> > > > +             if (env->src_grp_type == migrate_load) {

> > > > +                     unsigned long load = task_h_load(p);

> > > >

> > > > -             if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

> > > > -                     goto next;

> > > > +                     if (sched_feat(LB_MIN) &&

> > > > +                         load < 16 && !env->sd->nr_balance_failed)

> > > > +                             goto next;

> > > > +

> > > > +                     if ((load / 2) > env->imbalance)

> > > > +                             goto next;

> > >

> > > I know this existed before too but if the load is exactly or around 2x of

> > > env->imbalance, the resultant imbalance after the load balance operation

> > > would still be around env->imbalance. We may lose some cache affinity too.

> > >

> > > Can we do something like.

> > >                 if (2 * load > 3 * env->imbalance)

> > >                         goto next;

> >

> > TBH, I don't know what should be the best value and it's probably

> > worth doing some investigation but i would prefer to do that as a

> > separate patch to get a similar behavior in the overloaded case

> > Why do you propose 3/2 instead of 2 ?

> >

>

> If the imbalance is exactly or around load/2, then we still select the task to migrate

> However after the migrate the imbalance will still be load/2.

> - Can this lead to ping/pong?


In some cases you're probably right, but this might also help to move
other tasks when 3/2 will not.

1st example: 3 tasks with almost the same load on 2 CPUs. We will
probably end up migrating the waiting task, whose load / 2 is roughly <=
env->imbalance, whereas 2 * load > 3 * env->imbalance will not move any
task.
Note that we don't ensure fairness between the 3 tasks in the latter
case. TBH I don't know which is better.

2nd example: 2 tasks TA and TB with a load around L, and 4 tasks
T0 T1 T2 T3 with a load around L/4.
TA and TB are on CPU0 and T0 to T3 are on CPU1, so we have the same
imbalance as in the previous example: 2L on CPU0 and L on CPU1.
With the (load / 2) > env->imbalance check, we will still migrate TA or
TB to CPU1, and then we might end up migrating 2 tasks among T0 to T3
back, so the system will be balanced.
With the 2 * load > 3 * env->imbalance check, we will not migrate TA or
TB because the task load is higher than the imbalance, which is L/2, so
the scheduler will never get a chance to balance the system whereas it
could have reached a balanced state.
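
Putting rough numbers on that 2nd example (take L = 1024; as said above
the imbalance is L/2 = 512):

	CPU0: TA = 1024, TB = 1024 (2048)   CPU1: T0..T3 = 256 each (1024)

	(load / 2) > imbalance   : 512 > 512 is false, so TA can move to CPU1;
	                           a later balance pulls two 256-load tasks back
	                           and both CPUs end up around 1536
	2 * load > 3 * imbalance : 2048 > 1536, so TA and TB are never moved
	                           and the CPUs stay at 2048 / 1024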

Just to say that I'm not sure there is one best ratio to use.

> - Did we lose out of cache though we didn't gain from an imbalance.

>

> > > >

> > > >

> > > > -     if (sgs->sum_h_nr_running)

> > > > -             sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running;

> > > > +     sgs->group_capacity = group->sgc->capacity;

> > > >

> > > >       sgs->group_weight = group->group_weight;

> > > >

> > > > -     sgs->group_no_capacity = group_is_overloaded(env, sgs);

> > > > -     sgs->group_type = group_classify(group, sgs);

> > > > +     sgs->group_type = group_classify(env, group, sgs);

> > > > +

> > > > +     /* Computing avg_load makes sense only when group is overloaded */

> > > > +     if (sgs->group_type != group_overloaded)

> > > > +             sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /

> > > > +                             sgs->group_capacity;

> > >

> > > Mismatch in comment and code?

> >

> > I may need to add more comments but at this step, the group should be

> > either overloaded or fully busy but it can also be imbalanced.

> > In case of a group fully busy or imbalanced (sgs->group_type !=

> > group_overloaded), we haven't computed avg_load yet so we have to do

> > so because:

> > -In the case of fully_busy, we are going to be overloaded which the

> > next step after fully busy when you are about to pull more load

> > -In case of imbalance, we don't know the real state of the local group

> > so we fall back to this default behavior

> >

>

> We seem to be checking for avg_load when the group_type is group_overloaded.

> But somehow I don't see where sgs->avg_load is calculated for the

> group_overloaded case.


My fault, I read your comment too quickly and thought that you were
referring to calculate_imbalance(), but your comment is about
update_sg_lb_stats().
As Peter mentioned previously, this should be
  if (sgs->group_type == group_overloaded)
instead of
  if (sgs->group_type != group_overloaded)
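
i.e. the end of update_sg_lb_stats() would presumably become something
like this, with only the test flipped:

	sgs->group_type = group_classify(env, group, sgs);

	/* Computing avg_load makes sense only when group is overloaded */
	if (sgs->group_type == group_overloaded)
		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
				sgs->group_capacity;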

>

> > >

> > > We calculated avg_load for !group_overloaded case, but seem to be using for

> > > group_overloaded cases too.

> >

> > for group_overloaded case, we already computed it in update_sg_lb_stats()

> >

>

>

> From update_sg_lb_stats()

>

>         if (sgs->group_type != group_overloaded)

>                 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /

>                                 sgs->group_capacity;

>

> So we seem to be skipping calculation of avg_load for group_overloaded. No?


yes.

Thanks
Vincent

>

>

> --

> Thanks and Regards

> Srikar Dronamraju

>
diff mbox series

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 67f0acd..472959df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3771,7 +3771,7 @@  static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 		return;
 	}
 
-	rq->misfit_task_load = task_h_load(p);
+	rq->misfit_task_load = task_util_est(p);
 }
 
 #else /* CONFIG_SMP */
@@ -5376,18 +5376,6 @@  static unsigned long capacity_of(int cpu)
 	return cpu_rq(cpu)->cpu_capacity;
 }
 
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = cpu_runnable_load(rq);
-
-	if (nr_running)
-		return load_avg / nr_running;
-
-	return 0;
-}
-
 static void record_wakee(struct task_struct *p)
 {
 	/*
@@ -7060,12 +7048,21 @@  static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 enum fbq_type { regular, remote, all };
 
 enum group_type {
-	group_other = 0,
+	group_has_spare = 0,
+	group_fully_busy,
 	group_misfit_task,
+	group_asym_capacity,
 	group_imbalanced,
 	group_overloaded,
 };
 
+enum group_migration {
+	migrate_task = 0,
+	migrate_util,
+	migrate_load,
+	migrate_misfit,
+};
+
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED  0x04
@@ -7096,7 +7093,7 @@  struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
-	enum group_type		src_grp_type;
+	enum group_migration	src_grp_type;
 	struct list_head	tasks;
 };
 
@@ -7328,7 +7325,6 @@  static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
-	unsigned long load;
 	int detached = 0;
 
 	lockdep_assert_held(&env->src_rq->lock);
@@ -7361,19 +7357,46 @@  static int detach_tasks(struct lb_env *env)
 		if (!can_migrate_task(p, env))
 			goto next;
 
-		load = task_h_load(p);
+		if (env->src_grp_type == migrate_load) {
+			unsigned long load = task_h_load(p);
 
-		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
-			goto next;
+			if (sched_feat(LB_MIN) &&
+			    load < 16 && !env->sd->nr_balance_failed)
+				goto next;
+
+			if ((load / 2) > env->imbalance)
+				goto next;
+
+			env->imbalance -= load;
+		} else	if (env->src_grp_type == migrate_util) {
+			unsigned long util = task_util_est(p);
+
+			if (util > env->imbalance)
+				goto next;
+
+			env->imbalance -= util;
+		} else if (env->src_grp_type == migrate_misfit) {
+			unsigned long util = task_util_est(p);
+
+			/*
+			 * utilization of misfit task might decrease a bit
+			 * since it has been recorded. Be conservative in the
+			 * condition.
+			 */
+			if (2*util < env->imbalance)
+				goto next;
+
+			env->imbalance = 0;
+		} else {
+			/* Migrate task */
+			env->imbalance--;
+		}
 
-		if ((load / 2) > env->imbalance)
-			goto next;
 
 		detach_task(p, env);
 		list_add(&p->se.group_node, &env->tasks);
 
 		detached++;
-		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -7646,7 +7669,6 @@  static unsigned long task_h_load(struct task_struct *p)
 struct sg_lb_stats {
 	unsigned long avg_load; /*Avg load across the CPUs of the group */
 	unsigned long group_load; /* Total load over the CPUs of the group */
-	unsigned long load_per_task;
 	unsigned long group_capacity;
 	unsigned long group_util; /* Total utilization of the group */
 	unsigned int sum_nr_running; /* Nr tasks running in the group */
@@ -7654,8 +7676,7 @@  struct sg_lb_stats {
 	unsigned int idle_cpus;
 	unsigned int group_weight;
 	enum group_type group_type;
-	int group_no_capacity;
-	int group_asym_capacity;
+	unsigned int group_asym_capacity; /* tasks should be moved to preferred CPU */
 	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
@@ -7670,10 +7691,10 @@  struct sg_lb_stats {
 struct sd_lb_stats {
 	struct sched_group *busiest;	/* Busiest group in this sd */
 	struct sched_group *local;	/* Local group in this sd */
-	unsigned long total_running;
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
+	unsigned int prefer_sibling; /* tasks should go to sibling first */
 
 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
@@ -7690,14 +7711,14 @@  static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 	*sds = (struct sd_lb_stats){
 		.busiest = NULL,
 		.local = NULL,
-		.total_running = 0UL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
 		.busiest_stat = {
 			.avg_load = 0UL,
 			.sum_nr_running = 0,
 			.sum_h_nr_running = 0,
-			.group_type = group_other,
+			.idle_cpus = UINT_MAX,
+			.group_type = group_has_spare,
 		},
 	};
 }
@@ -7887,7 +7908,7 @@  static inline int sg_imbalanced(struct sched_group *group)
 static inline bool
 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_h_nr_running < sgs->group_weight)
+	if (sgs->sum_nr_running < sgs->group_weight)
 		return true;
 
 	if ((sgs->group_capacity * 100) >
@@ -7908,7 +7929,7 @@  group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
 static inline bool
 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_h_nr_running <= sgs->group_weight)
+	if (sgs->sum_nr_running <= sgs->group_weight)
 		return false;
 
 	if ((sgs->group_capacity * 100) <
@@ -7941,10 +7962,11 @@  group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 }
 
 static inline enum
-group_type group_classify(struct sched_group *group,
+group_type group_classify(struct lb_env *env,
+			  struct sched_group *group,
 			  struct sg_lb_stats *sgs)
 {
-	if (sgs->group_no_capacity)
+	if (group_is_overloaded(env, sgs))
 		return group_overloaded;
 
 	if (sg_imbalanced(group))
@@ -7953,7 +7975,13 @@  group_type group_classify(struct sched_group *group,
 	if (sgs->group_misfit_task_load)
 		return group_misfit_task;
 
-	return group_other;
+	if (sgs->group_asym_capacity)
+		return group_asym_capacity;
+
+	if (group_has_capacity(env, sgs))
+		return group_has_spare;
+
+	return group_fully_busy;
 }
 
 static bool update_nohz_stats(struct rq *rq, bool force)
@@ -7990,10 +8018,12 @@  static inline void update_sg_lb_stats(struct lb_env *env,
 				      struct sg_lb_stats *sgs,
 				      int *sg_status)
 {
-	int i, nr_running;
+	int i, nr_running, local_group;
 
 	memset(sgs, 0, sizeof(*sgs));
 
+	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
@@ -8003,9 +8033,9 @@  static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->group_load += cpu_runnable_load(rq);
 		sgs->group_util += cpu_util(i);
 		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
-		sgs->sum_nr_running += rq->cfs.h_nr_running;
-
 		nr_running = rq->nr_running;
+		sgs->sum_nr_running += nr_running;
+
 		if (nr_running > 1)
 			*sg_status |= SG_OVERLOAD;
 
@@ -8022,6 +8052,10 @@  static inline void update_sg_lb_stats(struct lb_env *env,
 		if (!nr_running && idle_cpu(i))
 			sgs->idle_cpus++;
 
+		if (local_group)
+			continue;
+
+		/* Check for a misfit task on the cpu */
 		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
 		    sgs->group_misfit_task_load < rq->misfit_task_load) {
 			sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -8029,17 +8063,24 @@  static inline void update_sg_lb_stats(struct lb_env *env,
 		}
 	}
 
-	/* Adjust by relative CPU capacity of the group */
-	sgs->group_capacity = group->sgc->capacity;
-	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+	/* Check if dst cpu is idle and preferred to this group */
+	if (env->sd->flags & SD_ASYM_PACKING &&
+	    env->idle != CPU_NOT_IDLE &&
+	    sgs->sum_h_nr_running &&
+	    sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
+		sgs->group_asym_capacity = 1;
+	}
 
-	if (sgs->sum_h_nr_running)
-		sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running;
+	sgs->group_capacity = group->sgc->capacity;
 
 	sgs->group_weight = group->group_weight;
 
-	sgs->group_no_capacity = group_is_overloaded(env, sgs);
-	sgs->group_type = group_classify(group, sgs);
+	sgs->group_type = group_classify(env, group, sgs);
+
+	/* Computing avg_load makes sense only when group is overloaded */
+	if (sgs->group_type != group_overloaded)
+		sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+				sgs->group_capacity;
 }
 
 /**
@@ -8070,7 +8111,7 @@  static bool update_sd_pick_busiest(struct lb_env *env,
 	 */
 	if (sgs->group_type == group_misfit_task &&
 	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||
-	     !group_has_capacity(env, &sds->local_stat)))
+	     sds->local_stat.group_type != group_has_spare))
 		return false;
 
 	if (sgs->group_type > busiest->group_type)
@@ -8079,11 +8120,18 @@  static bool update_sd_pick_busiest(struct lb_env *env,
 	if (sgs->group_type < busiest->group_type)
 		return false;
 
-	if (sgs->avg_load <= busiest->avg_load)
+	/* Select the overloaded group with highest avg_load */
+	if (sgs->group_type == group_overloaded &&
+	    sgs->avg_load <= busiest->avg_load)
+		return false;
+
+	/* Prefer to move from lowest priority CPU's work */
+	if (sgs->group_type == group_asym_capacity && sds->busiest &&
+	    sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
 		return false;
 
 	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
-		goto asym_packing;
+		goto spare_capacity;
 
 	/*
 	 * Candidate sg has no more than one task per CPU and
@@ -8091,7 +8139,7 @@  static bool update_sd_pick_busiest(struct lb_env *env,
 	 * capable CPUs may harm throughput. Maximize throughput,
 	 * power/energy consequences are not considered.
 	 */
-	if (sgs->sum_h_nr_running <= sgs->group_weight &&
+	if (sgs->group_type <= group_fully_busy &&
 	    group_smaller_min_cpu_capacity(sds->local, sg))
 		return false;
 
@@ -8102,39 +8150,32 @@  static bool update_sd_pick_busiest(struct lb_env *env,
 	    sgs->group_misfit_task_load < busiest->group_misfit_task_load)
 		return false;
 
-asym_packing:
-	/* This is the busiest node in its class. */
-	if (!(env->sd->flags & SD_ASYM_PACKING))
-		return true;
-
-	/* No ASYM_PACKING if target CPU is already busy */
-	if (env->idle == CPU_NOT_IDLE)
-		return true;
+spare_capacity:
 	/*
-	 * ASYM_PACKING needs to move all the work to the highest
-	 * prority CPUs in the group, therefore mark all groups
-	 * of lower priority than ourself as busy.
-	 *
-	 * This is primarily intended to used at the sibling level.  Some
-	 * cores like POWER7 prefer to use lower numbered SMT threads.  In the
-	 * case of POWER7, it can move to lower SMT modes only when higher
-	 * threads are idle.  When in lower SMT modes, the threads will
-	 * perform better since they share less core resources.  Hence when we
-	 * have idle threads, we want them to be the higher ones.
-	 */
-	if (sgs->sum_h_nr_running &&
-	    sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
-		sgs->group_asym_capacity = 1;
-		if (!sds->busiest)
-			return true;
+	 * Select not overloaded group with lowest number of idle cpus.
+	 * We could also compare the spare capacity which is more stable
+	 * but it can end up that the group has less spare capacity but
+	 * finally more idle cpus which means less opportunity to pull
+	 * tasks.
+	 */
+	if (sgs->group_type == group_has_spare &&
+	    sgs->idle_cpus > busiest->idle_cpus)
+		return false;
 
-		/* Prefer to move from lowest priority CPU's work */
-		if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
-				      sg->asym_prefer_cpu))
-			return true;
-	}
+	/*
+	 * Select the fully busy group with highest avg_load.
+	 * In theory, there is no need to pull task from such kind of group
+	 * because tasks have all compute capacity that they need but we can
+	 * still improve the overall throughput by reducing contention
+	 * when accessing shared HW resources.
+	 * XXX for now avg_load is not computed and always 0 so we select the
+	 * 1st one.
+	 */
+	if (sgs->group_type == group_fully_busy &&
+	    sgs->avg_load <= busiest->avg_load)
+		return false;
 
-	return false;
+	return true;
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -8172,13 +8213,13 @@  static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  * @env: The load balancing environment.
  * @sds: variable to hold the statistics for this sched_domain.
  */
+
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
-	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 	int sg_status = 0;
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -8205,22 +8246,6 @@  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		if (local_group)
 			goto next_group;
 
-		/*
-		 * In case the child domain prefers tasks go to siblings
-		 * first, lower the sg capacity so that we'll try
-		 * and move all the excess tasks away. We lower the capacity
-		 * of a group only if the local group has the capacity to fit
-		 * these excess tasks. The extra check prevents the case where
-		 * you always pull from the heaviest group when it is already
-		 * under-utilized (possible with a large weight task outweighs
-		 * the tasks on the system).
-		 */
-		if (prefer_sibling && sds->local &&
-		    group_has_capacity(env, local) &&
-		    (sgs->sum_h_nr_running > local->sum_h_nr_running + 1)) {
-			sgs->group_no_capacity = 1;
-			sgs->group_type = group_classify(sg, sgs);
-		}
 
 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
@@ -8229,13 +8254,15 @@  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 next_group:
 		/* Now, start updating sd_lb_stats */
-		sds->total_running += sgs->sum_h_nr_running;
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
 
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
+	/* Tag domain that child domain prefers tasks go to siblings first */
+	sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+
 #ifdef CONFIG_NO_HZ_COMMON
 	if ((env->flags & LBF_NOHZ_AGAIN) &&
 	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
@@ -8266,76 +8293,6 @@  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 }
 
 /**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- *			amongst the groups of a sched_domain, during
- *			load balancing.
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- */
-static inline
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
-{
-	unsigned long tmp, capa_now = 0, capa_move = 0;
-	unsigned int imbn = 2;
-	unsigned long scaled_busy_load_per_task;
-	struct sg_lb_stats *local, *busiest;
-
-	local = &sds->local_stat;
-	busiest = &sds->busiest_stat;
-
-	if (!local->sum_h_nr_running)
-		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
-	else if (busiest->load_per_task > local->load_per_task)
-		imbn = 1;
-
-	scaled_busy_load_per_task =
-		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
-		busiest->group_capacity;
-
-	if (busiest->avg_load + scaled_busy_load_per_task >=
-	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
-		env->imbalance = busiest->load_per_task;
-		return;
-	}
-
-	/*
-	 * OK, we don't have enough imbalance to justify moving tasks,
-	 * however we may be able to increase total CPU capacity used by
-	 * moving them.
-	 */
-
-	capa_now += busiest->group_capacity *
-			min(busiest->load_per_task, busiest->avg_load);
-	capa_now += local->group_capacity *
-			min(local->load_per_task, local->avg_load);
-	capa_now /= SCHED_CAPACITY_SCALE;
-
-	/* Amount of load we'd subtract */
-	if (busiest->avg_load > scaled_busy_load_per_task) {
-		capa_move += busiest->group_capacity *
-			    min(busiest->load_per_task,
-				busiest->avg_load - scaled_busy_load_per_task);
-	}
-
-	/* Amount of load we'd add */
-	if (busiest->avg_load * busiest->group_capacity <
-	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
-		tmp = (busiest->avg_load * busiest->group_capacity) /
-		      local->group_capacity;
-	} else {
-		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
-		      local->group_capacity;
-	}
-	capa_move += local->group_capacity *
-		    min(local->load_per_task, local->avg_load + tmp);
-	capa_move /= SCHED_CAPACITY_SCALE;
-
-	/* Move if we gain throughput */
-	if (capa_move > capa_now)
-		env->imbalance = busiest->load_per_task;
-}
-
-/**
  * calculate_imbalance - Calculate the amount of imbalance present within the
  *			 groups of a given sched_domain during load balance.
  * @env: load balance environment
@@ -8343,13 +8300,17 @@  void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
  */
 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	unsigned long max_pull, load_above_capacity = ~0UL;
 	struct sg_lb_stats *local, *busiest;
 
 	local = &sds->local_stat;
 	busiest = &sds->busiest_stat;
 
-	if (busiest->group_asym_capacity) {
+	if (busiest->group_type == group_asym_capacity) {
+		/*
+		 * In case of asym capacity, we will try to migrate all load
+		 * to the preferred CPU
+		 */
+		env->src_grp_type = migrate_load;
 		env->imbalance = busiest->group_load;
 		return;
 	}
@@ -8357,72 +8318,115 @@  static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	if (busiest->group_type == group_imbalanced) {
 		/*
 		 * In the group_imb case we cannot rely on group-wide averages
-		 * to ensure CPU-load equilibrium, look at wider averages. XXX
+		 * to ensure CPU-load equilibrium, try to move any task to fix
+		 * the imbalance. The next load balance will take care of
+		 * balancing back the system.
 		 */
-		busiest->load_per_task =
-			min(busiest->load_per_task, sds->avg_load);
+		env->src_grp_type = migrate_task;
+		env->imbalance = 1;
+		return;
 	}
 
-	/*
-	 * Avg load of busiest sg can be less and avg load of local sg can
-	 * be greater than avg load across all sgs of sd because avg load
-	 * factors in sg capacity and sgs with smaller group_type are
-	 * skipped when updating the busiest sg:
-	 */
-	if (busiest->group_type != group_misfit_task &&
-	    (busiest->avg_load <= sds->avg_load ||
-	     local->avg_load >= sds->avg_load)) {
-		env->imbalance = 0;
-		return fix_small_imbalance(env, sds);
+	if (busiest->group_type == group_misfit_task) {
+		/* Set imbalance to allow misfit task to be balanced. */
+		env->src_grp_type = migrate_misfit;
+		env->imbalance = busiest->group_misfit_task_load;
+		return;
 	}
 
 	/*
-	 * If there aren't any idle CPUs, avoid creating some.
+	 * Try to use spare capacity of local group without overloading it or
+	 * emptying busiest
 	 */
-	if (busiest->group_type == group_overloaded &&
-	    local->group_type   == group_overloaded) {
-		load_above_capacity = busiest->sum_h_nr_running * SCHED_CAPACITY_SCALE;
-		if (load_above_capacity > busiest->group_capacity) {
-			load_above_capacity -= busiest->group_capacity;
-			load_above_capacity *= scale_load_down(NICE_0_LOAD);
-			load_above_capacity /= busiest->group_capacity;
-		} else
-			load_above_capacity = ~0UL;
+	if (local->group_type == group_has_spare) {
+		long imbalance;
+
+		/*
+		 * If there is no overload, we just want to even the number of
+		 * idle cpus.
+		 */
+		env->src_grp_type = migrate_task;
+		imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);
+
+		if (sds->prefer_sibling)
+			/*
+			 * When prefer sibling, evenly spread running tasks on
+			 * groups.
+			 */
+			imbalance = (busiest->sum_nr_running - local->sum_nr_running) >> 1;
+
+		if (busiest->group_type > group_fully_busy) {
+			/*
+			 * If busiest is overloaded, try to fill spare
+			 * capacity. This might end up creating spare capacity
+			 * in busiest or busiest still being overloaded but
+			 * there is no simple way to directly compute the
+			 * amount of load to migrate in order to balance the
+			 * system.
+			 */
+			env->src_grp_type = migrate_util;
+			imbalance = max(local->group_capacity, local->group_util) -
+				    local->group_util;
+		}
+
+		env->imbalance = imbalance;
+		return;
 	}
 
 	/*
-	 * We're trying to get all the CPUs to the average_load, so we don't
-	 * want to push ourselves above the average load, nor do we wish to
-	 * reduce the max loaded CPU below the average load. At the same time,
-	 * we also don't want to reduce the group load below the group
-	 * capacity. Thus we look for the minimum possible imbalance.
+	 * Local is fully busy but has to take more load to relieve the
+	 * busiest group
 	 */
-	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
+	if (local->group_type < group_overloaded) {
+		/*
+		 * Local will become overloaded so the avg_load metrics are
+		 * finally needed
+		 */
 
-	/* How much load to actually move to equalise the imbalance */
-	env->imbalance = min(
-		max_pull * busiest->group_capacity,
-		(sds->avg_load - local->avg_load) * local->group_capacity
-	) / SCHED_CAPACITY_SCALE;
+		local->avg_load = (local->group_load*SCHED_CAPACITY_SCALE)
+						/ local->group_capacity;
 
-	/* Boost imbalance to allow misfit task to be balanced. */
-	if (busiest->group_type == group_misfit_task) {
-		env->imbalance = max_t(long, env->imbalance,
-				       busiest->group_misfit_task_load);
+		sds->avg_load = (SCHED_CAPACITY_SCALE * sds->total_load)
+						/ sds->total_capacity;
 	}
 
 	/*
-	 * if *imbalance is less than the average load per runnable task
-	 * there is no guarantee that any tasks will be moved so we'll have
-	 * a think about bumping its value to force at least one task to be
-	 * moved
+	 * Both group are or will become overloaded and we're trying to get
+	 * all the CPUs to the average_load, so we don't want to push
+	 * ourselves above the average load, nor do we wish to reduce the
+	 * max loaded CPU below the average load. At the same time, we also
+	 * don't want to reduce the group load below the group capacity.
+	 * Thus we look for the minimum possible imbalance.
 	 */
-	if (env->imbalance < busiest->load_per_task)
-		return fix_small_imbalance(env, sds);
+	env->src_grp_type = migrate_load;
+	env->imbalance = min(
+		(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
+		(sds->avg_load - local->avg_load) * local->group_capacity
+	) / SCHED_CAPACITY_SCALE;
 }
 
 /******* find_busiest_group() helpers end here *********************/
 
+/*
+ * Decision matrix according to the local and busiest group state
+ *
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
+ * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
+ * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
+ * misfit_task      force     N/A        N/A    N/A  force      force
+ * asym_capacity    force     force      N/A    N/A  force      force
+ * imbalanced       force     force      N/A    N/A  force      force
+ * overloaded       force     force      N/A    N/A  force      avg_load
+ *
+ * N/A :      Not Applicable because already filtered while updating
+ *            statistics.
+ * balanced : The system is balanced for these 2 groups.
+ * force :    Calculate the imbalance as load migration is probably needed.
+ * avg_load : Only if imbalance is significant enough.
+ * nr_idle :  dst_cpu is not busy and the number of idle cpus is quite
+ *            different in groups.
+ */
+
 /**
  * find_busiest_group - Returns the busiest group within the sched_domain
  * if there is an imbalance.
@@ -8457,17 +8461,13 @@  static struct sched_group *find_busiest_group(struct lb_env *env)
 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;
 
-	/* ASYM feature bypasses nice load balance check */
-	if (busiest->group_asym_capacity)
-		goto force_balance;
-
 	/* There is no busy sibling group to pull tasks from */
 	if (!sds.busiest || busiest->sum_h_nr_running == 0)
 		goto out_balanced;
 
-	/* XXX broken for overlapping NUMA groups */
-	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
-						/ sds.total_capacity;
+	/* ASYM feature bypasses nice load balance check */
+	if (busiest->group_type == group_asym_capacity)
+		goto force_balance;
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
@@ -8477,14 +8477,6 @@  static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
 
-	/*
-	 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
-	 * capacities from resulting in underutilization due to avg_load.
-	 */
-	if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
-	    busiest->group_no_capacity)
-		goto force_balance;
-
 	/* Misfit tasks should be dealt with regardless of the avg load */
 	if (busiest->group_type == group_misfit_task)
 		goto force_balance;
@@ -8493,44 +8485,68 @@  static struct sched_group *find_busiest_group(struct lb_env *env)
 	 * If the local group is busier than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
-	if (local->avg_load >= busiest->avg_load)
+	if (local->group_type > busiest->group_type)
 		goto out_balanced;
 
 	/*
-	 * Don't pull any tasks if this group is already above the domain
-	 * average load.
+	 * When groups are overloaded, use the avg_load to ensure fairness
+	 * between tasks.
 	 */
-	if (local->avg_load >= sds.avg_load)
-		goto out_balanced;
+	if (local->group_type == group_overloaded) {
+		/*
+		 * If the local group is more loaded than the selected
+		 * busiest group don't try and pull any tasks.
+		 */
+		if (local->avg_load >= busiest->avg_load)
+			goto out_balanced;
+
+		/* XXX broken for overlapping NUMA groups */
+		sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
+						/ sds.total_capacity;
 
-	if (env->idle == CPU_IDLE) {
 		/*
-		 * This CPU is idle. If the busiest group is not overloaded
-		 * and there is no imbalance between this and busiest group
-		 * wrt idle CPUs, it is balanced. The imbalance becomes
-		 * significant if the diff is greater than 1 otherwise we
-		 * might end up to just move the imbalance on another group
+		 * Don't pull any tasks if this group is already above the
+		 * domain average load.
 		 */
-		if ((busiest->group_type != group_overloaded) &&
-				(local->idle_cpus <= (busiest->idle_cpus + 1)))
+		if (local->avg_load >= sds.avg_load)
 			goto out_balanced;
-	} else {
+
 		/*
-		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
-		 * imbalance_pct to be conservative.
+		 * If the busiest group is more loaded, use imbalance_pct to be
+		 * conservative.
 		 */
 		if (100 * busiest->avg_load <=
 				env->sd->imbalance_pct * local->avg_load)
 			goto out_balanced;
+
 	}
 
+	/* Try to move all excess tasks to child's sibling domain */
+	if (sds.prefer_sibling && local->group_type == group_has_spare &&
+	    busiest->sum_nr_running > local->sum_nr_running + 1)
+		goto force_balance;
+
+	if (busiest->group_type != group_overloaded &&
+	     (env->idle == CPU_NOT_IDLE ||
+	      local->idle_cpus <= (busiest->idle_cpus + 1)))
+		/*
+		 * If the busiest group is not overloaded
+		 * and there is no imbalance between this and busiest group
+		 * wrt idle CPUs, it is balanced. The imbalance
+		 * becomes significant if the diff is greater than 1 otherwise
+		 * we might end up to just move the imbalance on another
+		 * group.
+		 */
+		goto out_balanced;
+
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
-	env->src_grp_type = busiest->group_type;
 	calculate_imbalance(env, &sds);
+
 	return env->imbalance ? sds.busiest : NULL;
 
 out_balanced:
+
 	env->imbalance = 0;
 	return NULL;
 }
@@ -8542,11 +8558,13 @@  static struct rq *find_busiest_queue(struct lb_env *env,
 				     struct sched_group *group)
 {
 	struct rq *busiest = NULL, *rq;
-	unsigned long busiest_load = 0, busiest_capacity = 1;
+	unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+	unsigned int busiest_nr = 0;
 	int i;
 
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
-		unsigned long capacity, load;
+		unsigned long capacity, load, util;
+		unsigned int nr_running;
 		enum fbq_type rt;
 
 		rq = cpu_rq(i);
@@ -8578,7 +8596,7 @@  static struct rq *find_busiest_queue(struct lb_env *env,
 		 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
 		 * seek the "biggest" misfit task.
 		 */
-		if (env->src_grp_type == group_misfit_task) {
+		if (env->src_grp_type == migrate_misfit) {
 			if (rq->misfit_task_load > busiest_load) {
 				busiest_load = rq->misfit_task_load;
 				busiest = rq;
@@ -8600,12 +8618,33 @@  static struct rq *find_busiest_queue(struct lb_env *env,
 		    rq->nr_running == 1)
 			continue;
 
-		load = cpu_runnable_load(rq);
+		if (env->src_grp_type == migrate_task) {
+			nr_running = rq->cfs.h_nr_running;
+
+			if (busiest_nr < nr_running) {
+				busiest_nr = nr_running;
+				busiest = rq;
+			}
+
+			continue;
+		}
+
+		if (env->src_grp_type == migrate_util) {
+			util = cpu_util(cpu_of(rq));
+
+			if (busiest_util < util) {
+				busiest_util = util;
+				busiest = rq;
+			}
+
+			continue;
+		}
 
 		/*
-		 * When comparing with imbalance, use cpu_runnable_load()
+		 * When comparing with the load imbalance, use cpu_runnable_load()
 		 * which is not scaled with the CPU capacity.
 		 */
+		load = cpu_runnable_load(rq);
 
 		if (rq->nr_running == 1 && load > env->imbalance &&
 		    !check_cpu_capacity(rq, env->sd))
@@ -8671,7 +8710,7 @@  voluntary_active_balance(struct lb_env *env)
 			return 1;
 	}
 
-	if (env->src_grp_type == group_misfit_task)
+	if (env->src_grp_type == migrate_misfit)
 		return 1;
 
 	return 0;