[3/4] sched,cgroup: Fix cpu_cgroup_fork()

Message ID 20160617120454.080767343@infradead.org
State New
Headers show

Commit Message

Peter Zijlstra June 17, 2016, 12:01 p.m.
From: Vincent Guittot <vincent.guittot@linaro.org>


A new fair task is detached and attached from/to task_group with:

  cgroup_post_fork()
    ss->fork(child) := cpu_cgroup_fork()
      sched_move_task()
        task_move_group_fair()

Which is wrong, because at this point in fork() the task isn't fully
initialized and it cannot 'move' to another group, because its not
attached to any group as yet.

In fact, cpu_cgroup_fork needs a small part of sched_move_task so we
can just call this small part directly instead sched_move_task. And
the task doesn't really migrate because it is not yet attached so we
need the sequence:

  do_fork()
    sched_fork()
      __set_task_cpu()

    cgroup_post_fork()
      set_task_rq() # set task group and runqueue

    wake_up_new_task()
      select_task_rq() can select a new cpu
      __set_task_cpu
      post_init_entity_util_avg
        attach_task_cfs_rq()
      activate_task
        enqueue_task

This patch makes that happen.

Maybe-Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

---
 kernel/sched/core.c |   67 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 20 deletions(-)

Comments

Vincent Guittot June 17, 2016, 1:58 p.m. | #1
On 17 June 2016 at 14:01, Peter Zijlstra <peterz@infradead.org> wrote:
> From: Vincent Guittot <vincent.guittot@linaro.org>

>

> A new fair task is detached and attached from/to task_group with:

>

>   cgroup_post_fork()

>     ss->fork(child) := cpu_cgroup_fork()

>       sched_move_task()

>         task_move_group_fair()

>

> Which is wrong, because at this point in fork() the task isn't fully

> initialized and it cannot 'move' to another group, because its not

> attached to any group as yet.

>

> In fact, cpu_cgroup_fork needs a small part of sched_move_task so we

> can just call this small part directly instead sched_move_task. And

> the task doesn't really migrate because it is not yet attached so we

> need the sequence:

>

>   do_fork()

>     sched_fork()

>       __set_task_cpu()

>

>     cgroup_post_fork()

>       set_task_rq() # set task group and runqueue

>

>     wake_up_new_task()

>       select_task_rq() can select a new cpu

>       __set_task_cpu

>       post_init_entity_util_avg

>         attach_task_cfs_rq()

>       activate_task

>         enqueue_task

>

> This patch makes that happen.

>


With this patch and patch 1, the fork sequence looks correct in my test

> Maybe-Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>


You can remove the Maybe if you want

> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

> ---

>  kernel/sched/core.c |   67 ++++++++++++++++++++++++++++++++++++----------------

>  1 file changed, 47 insertions(+), 20 deletions(-)

>

> --- a/kernel/sched/core.c

> +++ b/kernel/sched/core.c

> @@ -7743,27 +7743,17 @@ void sched_offline_group(struct task_gro

>         spin_unlock_irqrestore(&task_group_lock, flags);

>  }

>

> -/* change task's runqueue when it moves between groups.

> - *     The caller of this function should have put the task in its new group

> - *     by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to

> - *     reflect its new group.

> +/*

> + * Set task's runqueue and group.

> + *

> + * In case of a move between group, we update src and dst group thanks to

> + * sched_class->task_move_group. Otherwise, we just need to set runqueue and

> + * group pointers. The task will be attached to the runqueue during its wake

> + * up.

>   */

> -void sched_move_task(struct task_struct *tsk)

> +static void sched_set_group(struct task_struct *tsk, bool move)

>  {

>         struct task_group *tg;

> -       int queued, running;

> -       struct rq_flags rf;

> -       struct rq *rq;

> -

> -       rq = task_rq_lock(tsk, &rf);

> -

> -       running = task_current(rq, tsk);

> -       queued = task_on_rq_queued(tsk);

> -

> -       if (queued)

> -               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);

> -       if (unlikely(running))

> -               put_prev_task(rq, tsk);

>

>         /*

>          * All callers are synchronized by task_rq_lock(); we do not use RCU

> @@ -7776,11 +7766,37 @@ void sched_move_task(struct task_struct

>         tsk->sched_task_group = tg;

>

>  #ifdef CONFIG_FAIR_GROUP_SCHED

> -       if (tsk->sched_class->task_move_group)

> +       if (move && tsk->sched_class->task_move_group)

>                 tsk->sched_class->task_move_group(tsk);

>         else

>  #endif

>                 set_task_rq(tsk, task_cpu(tsk));

> +}

> +

> +/*

> + * Change task's runqueue when it moves between groups.

> + *

> + * The caller of this function should have put the task in its new group by

> + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect

> + * its new group.

> + */

> +void sched_move_task(struct task_struct *tsk)

> +{

> +       int queued, running;

> +       struct rq_flags rf;

> +       struct rq *rq;

> +

> +       rq = task_rq_lock(tsk, &rf);

> +

> +       running = task_current(rq, tsk);

> +       queued = task_on_rq_queued(tsk);

> +

> +       if (queued)

> +               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);

> +       if (unlikely(running))

> +               put_prev_task(rq, tsk);

> +

> +       sched_set_group(tsk, true);

>

>         if (unlikely(running))

>                 tsk->sched_class->set_curr_task(rq);

> @@ -8208,9 +8224,20 @@ static void cpu_cgroup_css_free(struct c

>         sched_free_group(tg);

>  }

>

> +/*

> + * This is called before wake_up_new_task(), therefore we really only

> + * have to set its group bits, all the other stuff does not apply.

> + */

>  static void cpu_cgroup_fork(struct task_struct *task)

>  {

> -       sched_move_task(task);

> +       struct rq_flags rf;

> +       struct rq *rq;

> +

> +       rq = task_rq_lock(task, &rf);

> +

> +       sched_set_group(task, false);

> +

> +       task_rq_unlock(rq, task, &rf);

>  }

>

>  static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)

>

>

Patch

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7743,27 +7743,17 @@  void sched_offline_group(struct task_gro
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
-/* change task's runqueue when it moves between groups.
- *	The caller of this function should have put the task in its new group
- *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- *	reflect its new group.
+/*
+ * Set task's runqueue and group.
+ *
+ * In case of a move between group, we update src and dst group thanks to
+ * sched_class->task_move_group. Otherwise, we just need to set runqueue and
+ * group pointers. The task will be attached to the runqueue during its wake
+ * up.
  */
-void sched_move_task(struct task_struct *tsk)
+static void sched_set_group(struct task_struct *tsk, bool move)
 {
 	struct task_group *tg;
-	int queued, running;
-	struct rq_flags rf;
-	struct rq *rq;
-
-	rq = task_rq_lock(tsk, &rf);
-
-	running = task_current(rq, tsk);
-	queued = task_on_rq_queued(tsk);
-
-	if (queued)
-		dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
-	if (unlikely(running))
-		put_prev_task(rq, tsk);
 
 	/*
 	 * All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7776,11 +7766,37 @@  void sched_move_task(struct task_struct
 	tsk->sched_task_group = tg;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (tsk->sched_class->task_move_group)
+	if (move && tsk->sched_class->task_move_group)
 		tsk->sched_class->task_move_group(tsk);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+	int queued, running;
+	struct rq_flags rf;
+	struct rq *rq;
+
+	rq = task_rq_lock(tsk, &rf);
+
+	running = task_current(rq, tsk);
+	queued = task_on_rq_queued(tsk);
+
+	if (queued)
+		dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+	if (unlikely(running))
+		put_prev_task(rq, tsk);
+
+	sched_set_group(tsk, true);
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
@@ -8208,9 +8224,20 @@  static void cpu_cgroup_css_free(struct c
 	sched_free_group(tg);
 }
 
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
 static void cpu_cgroup_fork(struct task_struct *task)
 {
-	sched_move_task(task);
+	struct rq_flags rf;
+	struct rq *rq;
+
+	rq = task_rq_lock(task, &rf);
+
+	sched_set_group(task, false);
+
+	task_rq_unlock(rq, task, &rf);
 }
 
 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)