
[v2,02/13] sched/fair: Consistent use of prev_cpu in wakeup path

Message ID 1466615004-3503-3-git-send-email-morten.rasmussen@arm.com
State Superseded
Headers show

Commit Message

Morten Rasmussen June 22, 2016, 5:03 p.m. UTC
In commit ac66f5477239 ("sched/numa: Introduce migrate_swap()")
select_task_rq() got a 'cpu' argument to enable overriding of prev_cpu
in special cases (NUMA task swapping). However, the
select_task_rq_fair() helper functions wake_affine() and
select_idle_sibling() still use task_cpu(p) directly to work out
prev_cpu, which leads to inconsistencies.

This patch passes prev_cpu (potentially overridden by NUMA code) into
the helper functions to ensure prev_cpu is indeed the same cpu
everywhere in the wakeup path.

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
cc: Rik van Riel <riel@redhat.com>

Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>

---
 kernel/sched/fair.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

-- 
1.9.1
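
To make the inconsistency concrete, here is a minimal standalone sketch
(plain userspace C with mock types; names and numbers are invented for
illustration, this is not kernel code) contrasting a helper that
re-derives "prev" from the task, as the pre-patch wake_affine() and
select_idle_sibling() do via task_cpu(p), with one that trusts the
prev_cpu threaded down from select_task_rq():

	#include <stdio.h>

	/* Mock stand-ins for task_cpu(p) and p->wake_cpu. */
	struct task { int cpu; int wake_cpu; };

	/* Pre-patch shape: the helper quietly re-derives "prev"... */
	static int helper_old(const struct task *p, int target)
	{
		int prev = p->cpu;	/* like: int i = task_cpu(p); */
		printf("old helper thinks prev = %d\n", prev);
		return target;
	}

	/* ...post-patch shape: "prev" is handed down by the caller. */
	static int helper_new(const struct task *p, int prev, int target)
	{
		(void)p;
		printf("new helper is told prev = %d\n", prev);
		return target;
	}

	int main(void)
	{
		/* migrate_swap() on a sleeping task leaves wake_cpu != cpu. */
		struct task p = { .cpu = 2, .wake_cpu = 5 };

		/* try_to_wake_up() passes p->wake_cpu down as prev_cpu. */
		helper_old(&p, 6);		/* prev = 2: disagrees   */
		helper_new(&p, p.wake_cpu, 6);	/* prev = 5: consistent  */
		return 0;
	}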

Comments

Morten Rasmussen June 23, 2016, 9:56 a.m. UTC | #1
On Wed, Jun 22, 2016 at 02:04:11PM -0400, Rik van Riel wrote:
> On Wed, 2016-06-22 at 18:03 +0100, Morten Rasmussen wrote:
> > In commit ac66f5477239 ("sched/numa: Introduce migrate_swap()")
> > select_task_rq() got a 'cpu' argument to enable overriding of prev_cpu
> > in special cases (NUMA task swapping). However, the
> > select_task_rq_fair() helper functions wake_affine() and
> > select_idle_sibling() still use task_cpu(p) directly to work out
> > prev_cpu, which leads to inconsistencies.
> > 
> > This patch passes prev_cpu (potentially overridden by NUMA code) into
> > the helper functions to ensure prev_cpu is indeed the same cpu
> > everywhere in the wakeup path.
> > 
> > cc: Ingo Molnar <mingo@redhat.com>
> > cc: Peter Zijlstra <peterz@infradead.org>
> > cc: Rik van Riel <riel@redhat.com>
> > 
> > Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
> > ---
> >  kernel/sched/fair.c | 24 +++++++++++++-----------
> >  1 file changed, 13 insertions(+), 11 deletions(-)
> > 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index c6dd8bab010c..eec8e29104f9 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -656,7 +656,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >  }
> >  
> >  #ifdef CONFIG_SMP
> > -static int select_idle_sibling(struct task_struct *p, int cpu);
> > +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
> >  static unsigned long task_h_load(struct task_struct *p);
> >  
> >  /*
> > @@ -1483,7 +1483,8 @@ static void task_numa_compare(struct task_numa_env *env,
> >  	 * Call select_idle_sibling to maybe find a better one.
> >  	 */
> >  	if (!cur)
> > -		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
> > +		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
> > +						   env->dst_cpu);
> 
> It is worth remembering that "prev" will only ever be returned by
> select_idle_sibling() if it is part of the same NUMA node as target.
> 
> That means this patch does not change behaviour of the NUMA balancing
> code, since that always migrates between nodes.
> 
> Now let's look at try_to_wake_up(). It will pass p->wake_cpu as the
> argument for "prev_cpu", which again appears to be the same CPU number
> as that used by the current code.

IIUC, p->wake_cpu != task_cpu(p) if task_numa_migrate() decided to call
migrate_swap() on the task while it was sleeping, intending it to swap
places with a task on a different NUMA node when it wakes up. Using
p->wake_cpu as "prev_cpu" in select_idle_sibling() when called through
try_to_wake_up()->select_task_rq() should therefore only make a
difference if that cpu happens to share cache with the target cpu and is
idle:

	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
		return prev;
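
For reference, a small standalone sketch (userspace C; cpus_share_cache()
and idle_cpu() are mocked with an invented topology, so this is
illustrative only) of the one case where feeding p->wake_cpu rather than
task_cpu(p) into the check above changes the cpu that is returned:

	#include <stdbool.h>
	#include <stdio.h>

	/* Mock topology: 4 cpus per LLC; every cpu except 0 is idle. */
	static bool cpus_share_cache(int a, int b) { return a / 4 == b / 4; }
	static bool idle_cpu(int c) { return c != 0; }

	/* The check quoted above, parameterised on which "prev" we feed it. */
	static int sis_prev_check(int prev, int target)
	{
		if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
			return prev;
		return target;
	}

	int main(void)
	{
		/* After migrate_swap() on a sleeping task, wake_cpu can point
		 * away from task_cpu(p), e.g. into the target's LLC. */
		int task_cpu = 1, wake_cpu = 5, target = 6;

		printf("prev = task_cpu(p): %d\n", sis_prev_check(task_cpu, target)); /* 6 */
		printf("prev = p->wake_cpu: %d\n", sis_prev_check(wake_cpu, target)); /* 5 */
		return 0;
	}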

The selection of the target cpu for select_idle_sibling() is also
slightly affected, as wake_affine() currently compares task_cpu(p) and
smp_processor_id(), and then picks p->wake_cpu or smp_processor_id()
depending on the outcome. With this patch wake_affine() uses p->wake_cpu
instead of task_cpu(p), so we actually compare the two candidates we are
choosing between.
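
A similarly hedged sketch of the wake_affine() side (mock per-cpu loads
standing in for the real source_load()/target_load() arithmetic): before
the patch, the balance check weighs task_cpu(p) against the waking cpu,
but when the check fails the caller falls back to p->wake_cpu, so the
cpu that was weighed and the cpu that gets used can differ:

	#include <stdio.h>

	/* Mock per-cpu load; numbers invented for illustration. */
	static int load_of(int cpu) { return cpu * 10; }

	/* Crude stand-in for wake_affine() plus the fallback logic in
	 * select_task_rq_fair(): weigh "prev_weighed", but fall back to
	 * "prev_used" when the affine check fails. */
	static int pick(int prev_weighed, int prev_used, int this_cpu)
	{
		if (load_of(this_cpu) <= load_of(prev_weighed))
			return this_cpu;	/* wake_affine() succeeded */
		return prev_used;		/* fall back to prev_cpu   */
	}

	int main(void)
	{
		int task_cpu = 1, wake_cpu = 5, this_cpu = 3;

		/* Pre-patch: weighs task_cpu(p), but falls back to wake_cpu. */
		printf("old: %d\n", pick(task_cpu, wake_cpu, this_cpu));	/* 5 */
		/* Post-patch: weighs the same prev_cpu it falls back to. */
		printf("new: %d\n", pick(wake_cpu, wake_cpu, this_cpu));	/* 3 */
		return 0;
	}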

I think that would lead to some minor changes in behaviour in a few
corner cases, but I mainly wrote the patch as I thought it was very
confusing that we could have different "prev_cpu"s in different parts of
the select_task_rq_fair() code path.

> I have no objection to your patch, but must be overlooking something,
> since I cannot find a change in behaviour that your patch would create.

Thanks for confirming that it shouldn't change anything for NUMA load
balancing. That is what I hope for :-)

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c6dd8bab010c..eec8e29104f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -656,7 +656,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 /*
@@ -1483,7 +1483,8 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
 	if (!cur)
-		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+						   env->dst_cpu);
 
 assign:
 	task_numa_assign(env, cur, imp);
@@ -4985,18 +4986,18 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+		       int prev_cpu, int sync)
 {
 	s64 this_load, load;
 	s64 this_eff_load, prev_eff_load;
-	int idx, this_cpu, prev_cpu;
+	int idx, this_cpu;
 	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
-	prev_cpu  = task_cpu(p);
 	load	  = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
@@ -5161,11 +5162,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
 	struct sched_domain *sd;
 	struct sched_group *sg;
-	int i = task_cpu(p);
 
 	if (idle_cpu(target))
 		return target;
@@ -5173,8 +5173,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * If the prevous cpu is cache affine and idle, don't be stupid.
 	 */
-	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-		return i;
+	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+		return prev;
 
 	/*
 	 * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -5195,6 +5195,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	for_each_lower_domain(sd) {
 		sg = sd->groups;
 		do {
+			int i;
+
 			if (!cpumask_intersects(sched_group_cpus(sg),
 						tsk_cpus_allowed(p)))
 				goto next;
@@ -5303,13 +5305,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 	if (affine_sd) {
 		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+		if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
 			new_cpu = cpu;
 	}
 
 	if (!sd) {
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-			new_cpu = select_idle_sibling(p, new_cpu);
+			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
 	} else while (sd) {
 		struct sched_group *group;