sched/fair: Fix ftq noise bench regression

Message ID 1489758442-2877-1-git-send-email-vincent.guittot@linaro.org
State New
Headers show

Commit Message

Vincent Guittot March 17, 2017, 1:47 p.m.
A regression of the ftq noise has been reported by ying.huang@linux.intel.com
on 8 threads Intel(R) Core(TM)i7-4770 CPU @ 3.40GHz with 8G memory due to 

  commit 4e5160766fcc ("sched/fair: Propagate asynchrous detach")

The only part of the patch that can increase the noise is the update
of blocked load of group entity in update_blocked_averages().
We can optimize this call and skip the update of group entity if its load
and utilization are already null and there is no pending propagation of load
in the task group.

This optimization partly restores the noise score. A more aggressive
optimization has been tried but has shown worse score.

Reported-by: ying.huang@linux.intel.com
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

Fixes: 4e5160766fcc ("sched/fair: Propagate asynchrous detach")

---
 kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

-- 
2.7.4

Comments

Dietmar Eggemann March 21, 2017, 5:46 p.m. | #1
Hi Vincent,

On 17/03/17 13:47, Vincent Guittot wrote:

[...]

> Reported-by: ying.huang@linux.intel.com

> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

> Fixes: 4e5160766fcc ("sched/fair: Propagate asynchrous detach")


I thought I can see a difference by running:

 perf stat --null --repeat 10 -- perf bench sched messaging -g 50 -l 
 5000 

on an Ubuntu 16.10 server system.

Number of entries in the rq->leaf_cfs_rq_list per cpu: ~40

Target: Intel i5-3320M (4 logical cpus)

tip/sched/core: 42.119140365 seconds time elapsed ( +-  0.33% )

+ patch       : 42.089557108 seconds time elapsed ( +-  0.37% )

Maybe I need a CPU with more 'logical cpus' or a different test case.
Anyway, couldn't spot any regression.

> ---

>  kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++---

>  1 file changed, 36 insertions(+), 3 deletions(-)

> 

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

> index 2805bd7..007df59 100644

> --- a/kernel/sched/fair.c

> +++ b/kernel/sched/fair.c

> @@ -3173,6 +3173,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)

>  	return 1;

>  }

>  

> +/*

> + * Check if we need to update the load and the utilization of a blocked

> + * group_entity

> + */

> +static inline bool skip_blocked_update(struct sched_entity *se)

> +{

> +	struct cfs_rq *gcfs_rq = group_cfs_rq(se);

> +

> +	/*

> +	 * If sched_entity still have not null load or utilization, we have to

> +	 * decay it.

> +	 */

> +	if (se->avg.load_avg || se->avg.util_avg)

> +		return false;

> +

> +	/*

> +	 * If there is a pending propagation, we have to update the load and

> +	 * the utilizaion of the sched_entity


nit pick: s/utilizaion/utilization

> +	 */

> +	if (gcfs_rq->propagate_avg)

> +		return false;

> +

> +	/*

> +	 * Other wise, the load and the utilization of the sched_entity is

> +	 * already null and there is no pending propagation so it will be a

> +	 * waste of time to try to decay it.

> +	 */

> +	return true;

> +}

> +

>  #else /* CONFIG_FAIR_GROUP_SCHED */

>  

>  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}

> @@ -6961,6 +6991,8 @@ static void update_blocked_averages(int cpu)

>  	 * list_add_leaf_cfs_rq() for details.

>  	 */

>  	for_each_leaf_cfs_rq(rq, cfs_rq) {

> +		struct sched_entity *se;

> +

>  		/* throttled entities do not contribute to load */

>  		if (throttled_hierarchy(cfs_rq))

>  			continue;

> @@ -6968,9 +7000,10 @@ static void update_blocked_averages(int cpu)

>  		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))

>  			update_tg_load_avg(cfs_rq, 0);

>  

> -		/* Propagate pending load changes to the parent */

> -		if (cfs_rq->tg->se[cpu])

> -			update_load_avg(cfs_rq->tg->se[cpu], 0);

> +		/* Propagate pending load changes to the parent if any */

> +		se = cfs_rq->tg->se[cpu];

> +		if (se && !skip_blocked_update(se))

> +			update_load_avg(se, 0);

>  	}

>  	rq_unlock_irqrestore(rq, &rf);

>  }

> 


Why not turn skip_blocked_update(se) into blocked_update_needed(cpu, cfs_rq)?
Saves a couple of patch lines:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 007df5953d1a..8001eeb4ec18 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3177,30 +3177,34 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
  * Check if we need to update the load and the utilization of a blocked
  * group_entity
  */
-static inline bool skip_blocked_update(struct sched_entity *se)
+static inline bool blocked_update_needed(int cpu, struct cfs_rq *cfs_rq)
 {
-       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+       struct sched_entity *se = cfs_rq->tg->se[cpu];
+
+       /* cfs_rq of a root task_group has no sched_entity counterpart */
+       if (!se)
+               return false;
 
        /*
         * If sched_entity still have not null load or utilization, we have to
         * decay it.
         */
        if (se->avg.load_avg || se->avg.util_avg)
-               return false;
+               return true;
 
        /*
         * If there is a pending propagation, we have to update the load and
         * the utilizaion of the sched_entity
         */
-       if (gcfs_rq->propagate_avg)
-               return false;
+       if (cfs_rq->propagate_avg)
+               return true;
 
        /*
         * Other wise, the load and the utilization of the sched_entity is
         * already null and there is no pending propagation so it will be a
         * waste of time to try to decay it.
         */
-       return true;
+       return false;
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
@@ -6991,8 +6995,6 @@ static void update_blocked_averages(int cpu)
         * list_add_leaf_cfs_rq() for details.
         */
        for_each_leaf_cfs_rq(rq, cfs_rq) {
-               struct sched_entity *se;
-
                /* throttled entities do not contribute to load */
                if (throttled_hierarchy(cfs_rq))
                        continue;
@@ -7001,9 +7003,8 @@ static void update_blocked_averages(int cpu)
                        update_tg_load_avg(cfs_rq, 0);
 
                /* Propagate pending load changes to the parent if any */
-               se = cfs_rq->tg->se[cpu];
-               if (se && !skip_blocked_update(se))
-                       update_load_avg(se, 0);
+               if (blocked_update_needed(cpu, cfs_rq))
+                       update_load_avg(cfs_rq->tg->se[cpu], 0);
        }
        rq_unlock_irqrestore(rq, &rf);
 }

Vincent Guittot March 22, 2017, 9:22 a.m. | #2
On 21 March 2017 at 18:46, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> Hi Vincent,

>

> On 17/03/17 13:47, Vincent Guittot wrote:

>

> [...]

>

>> Reported-by: ying.huang@linux.intel.com

>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

>> Fixes: 4e5160766fcc ("sched/fair: Propagate asynchrous detach")

>

> I thought I can see a difference by running:

>

>  perf stat --null --repeat 10 -- perf bench sched messaging -g 50 -l

>  5000

>

> on an Ubuntu 16.10 server system.

>

> Number of entries in the rq->leaf_cfs_rq_list per cpu: ~40

>

> Target: Intel i5-3320M (4 logical cpus)

>

> tip/sched/core: 42.119140365 seconds time elapsed ( +-  0.33% )

>

> + patch       : 42.089557108 seconds time elapsed ( +-  0.37% )

>

> Maybe I need a CPU with more 'logical cpus' or a different test case.

> Anyway, couldn't spot any regression.


ok. I haven't been able to reproduce the regression on my platforms too

>

>> ---

>>  kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++---

>>  1 file changed, 36 insertions(+), 3 deletions(-)

>>

>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

>> index 2805bd7..007df59 100644

>> --- a/kernel/sched/fair.c

>> +++ b/kernel/sched/fair.c

>> @@ -3173,6 +3173,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)

>>       return 1;

>>  }

>>

>> +/*

>> + * Check if we need to update the load and the utilization of a blocked

>> + * group_entity

>> + */

>> +static inline bool skip_blocked_update(struct sched_entity *se)

>> +{

>> +     struct cfs_rq *gcfs_rq = group_cfs_rq(se);

>> +

>> +     /*

>> +      * If sched_entity still have not null load or utilization, we have to

>> +      * decay it.

>> +      */

>> +     if (se->avg.load_avg || se->avg.util_avg)

>> +             return false;

>> +

>> +     /*

>> +      * If there is a pending propagation, we have to update the load and

>> +      * the utilizaion of the sched_entity

>

> nit pick: s/utilizaion/utilization


yes


>

>> +      */

>> +     if (gcfs_rq->propagate_avg)

>> +             return false;

>> +

>> +     /*

>> +      * Other wise, the load and the utilization of the sched_entity is

>> +      * already null and there is no pending propagation so it will be a

>> +      * waste of time to try to decay it.

>> +      */

>> +     return true;

>> +}

>> +

>>  #else /* CONFIG_FAIR_GROUP_SCHED */

>>

>>  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}

>> @@ -6961,6 +6991,8 @@ static void update_blocked_averages(int cpu)

>>        * list_add_leaf_cfs_rq() for details.

>>        */

>>       for_each_leaf_cfs_rq(rq, cfs_rq) {

>> +             struct sched_entity *se;

>> +

>>               /* throttled entities do not contribute to load */

>>               if (throttled_hierarchy(cfs_rq))

>>                       continue;

>> @@ -6968,9 +7000,10 @@ static void update_blocked_averages(int cpu)

>>               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))

>>                       update_tg_load_avg(cfs_rq, 0);

>>

>> -             /* Propagate pending load changes to the parent */

>> -             if (cfs_rq->tg->se[cpu])

>> -                     update_load_avg(cfs_rq->tg->se[cpu], 0);

>> +             /* Propagate pending load changes to the parent if any */

>> +             se = cfs_rq->tg->se[cpu];

>> +             if (se && !skip_blocked_update(se))

>> +                     update_load_avg(se, 0);

>>       }

>>       rq_unlock_irqrestore(rq, &rf);

>>  }

>>

>

> Why not turn skip_blocked_update(se) into blocked_update_needed(cpu, cfs_rq)?

> Saves a couple of patch lines:


Are you sure that we are saving some patch lines ?

I tend to agree on the name and but not on parameters.
IMO, it's easier to understand the purpose of
blocked_update_needed(se) compared to blocked_update_needed(cpu,
cfs_rq)


>

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

> index 007df5953d1a..8001eeb4ec18 100644

> --- a/kernel/sched/fair.c

> +++ b/kernel/sched/fair.c

> @@ -3177,30 +3177,34 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)

>   * Check if we need to update the load and the utilization of a blocked

>   * group_entity

>   */

> -static inline bool skip_blocked_update(struct sched_entity *se)

> +static inline bool blocked_update_needed(int cpu, struct cfs_rq *cfs_rq)

>  {

> -       struct cfs_rq *gcfs_rq = group_cfs_rq(se);

> +       struct sched_entity *se = cfs_rq->tg->se[cpu];

> +

> +       /* cfs_rq of a root task_group has no sched_entity counterpart */

> +       if (!se)

> +               return false;

>

>         /*

>          * If sched_entity still have not null load or utilization, we have to

>          * decay it.

>          */

>         if (se->avg.load_avg || se->avg.util_avg)

> -               return false;

> +               return true;

>

>         /*

>          * If there is a pending propagation, we have to update the load and

>          * the utilizaion of the sched_entity

>          */

> -       if (gcfs_rq->propagate_avg)

> -               return false;

> +       if (cfs_rq->propagate_avg)

> +               return true;

>

>         /*

>          * Other wise, the load and the utilization of the sched_entity is

>          * already null and there is no pending propagation so it will be a

>          * waste of time to try to decay it.

>          */

> -       return true;

> +       return false;

>  }

>

>  #else /* CONFIG_FAIR_GROUP_SCHED */

> @@ -6991,8 +6995,6 @@ static void update_blocked_averages(int cpu)

>          * list_add_leaf_cfs_rq() for details.

>          */

>         for_each_leaf_cfs_rq(rq, cfs_rq) {

> -               struct sched_entity *se;

> -

>                 /* throttled entities do not contribute to load */

>                 if (throttled_hierarchy(cfs_rq))

>                         continue;

> @@ -7001,9 +7003,8 @@ static void update_blocked_averages(int cpu)

>                         update_tg_load_avg(cfs_rq, 0);

>

>                 /* Propagate pending load changes to the parent if any */

> -               se = cfs_rq->tg->se[cpu];

> -               if (se && !skip_blocked_update(se))

> -                       update_load_avg(se, 0);

> +               if (blocked_update_needed(cpu, cfs_rq))

> +                       update_load_avg(cfs_rq->tg->se[cpu], 0);

>         }

>         rq_unlock_irqrestore(rq, &rf);

>  }

>

>

>

>
Dietmar Eggemann March 22, 2017, 4:22 p.m. | #3
On 22/03/17 09:22, Vincent Guittot wrote:
> On 21 March 2017 at 18:46, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:

>> Hi Vincent,

>>

>> On 17/03/17 13:47, Vincent Guittot wrote:

>>

>> [...]

>>

>>> Reported-by: ying.huang@linux.intel.com

>>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

>>> Fixes: 4e5160766fcc ("sched/fair: Propagate asynchrous detach")


[...]

>> Why not turn skip_blocked_update(se) into blocked_update_needed(cpu, cfs_rq)?

>> Saves a couple of patch lines:

> 

> Are you sure that we are saving some patch lines ?


Sorry, it's actually the same :-)

> 

> I tend to agree on the name and but not on parameters.

> IMO, it's easier to understand the purpose of

> blocked_update_needed(se) compared to blocked_update_needed(cpu,

> cfs_rq)


OK, so:

-               /* Propagate pending load changes to the parent */
-               if (cfs_rq->tg->se[cpu])
+               /* Propagate pending load changes to the parent if any */
+               if (blocked_update_needed(cfs_rq->tg->se[cpu]))

and

+static inline bool blocked_update_needed(struct sched_entity *se)
+{
+       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+       /* cfs_rq of a root task_group has no sched_entity counterpart */
+       if (!se)
+               return false;
+
+       /*
+        * If sched_entity still have not null load or utilization, we
have to
+        * decay it.
+        */
....

Would make sense to me ...
Vincent Guittot March 22, 2017, 4:55 p.m. | #4
On 22 March 2017 at 17:22, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 22/03/17 09:22, Vincent Guittot wrote:

>> On 21 March 2017 at 18:46, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:

>>> Hi Vincent,

>>>

>>> On 17/03/17 13:47, Vincent Guittot wrote:

>>>

>>> [...]

>>>

>>>> Reported-by: ying.huang@linux.intel.com

>>>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>

>>>> Fixes: 4e5160766fcc ("sched/fair: Propagate asynchrous detach")

>

> [...]

>

>>> Why not turn skip_blocked_update(se) into blocked_update_needed(cpu, cfs_rq)?

>>> Saves a couple of patch lines:

>>

>> Are you sure that we are saving some patch lines ?

>

> Sorry, it's actually the same :-)

>

>>

>> I tend to agree on the name and but not on parameters.

>> IMO, it's easier to understand the purpose of

>> blocked_update_needed(se) compared to blocked_update_needed(cpu,

>> cfs_rq)

>

> OK, so:

>

> -               /* Propagate pending load changes to the parent */

> -               if (cfs_rq->tg->se[cpu])

> +               /* Propagate pending load changes to the parent if any */

> +               if (blocked_update_needed(cfs_rq->tg->se[cpu]))

>

> and

>

> +static inline bool blocked_update_needed(struct sched_entity *se)

> +{

> +       struct cfs_rq *gcfs_rq = group_cfs_rq(se);


gcfs_rq can't be set before testing that se is not null

> +

> +       /* cfs_rq of a root task_group has no sched_entity counterpart */

> +       if (!se)

> +               return false;

> +

> +       /*

> +        * If sched_entity still have not null load or utilization, we

> have to

> +        * decay it.

> +        */

> ....

>

> Would make sense to me ...
Dietmar Eggemann March 22, 2017, 5:22 p.m. | #5
On 22/03/17 16:55, Vincent Guittot wrote:
> On 22 March 2017 at 17:22, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:

>> On 22/03/17 09:22, Vincent Guittot wrote:

>>> On 21 March 2017 at 18:46, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:

>>>> Hi Vincent,

>>>>

>>>> On 17/03/17 13:47, Vincent Guittot wrote:


[...]

>> +static inline bool blocked_update_needed(struct sched_entity *se)

>> +{

>> +       struct cfs_rq *gcfs_rq = group_cfs_rq(se);

> 

> gcfs_rq can't be set before testing that se is not null


Ah correct, but you could set it after the !se check.

Patch hide | download patch | download mbox

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2805bd7..007df59 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3173,6 +3173,36 @@  static inline int propagate_entity_load_avg(struct sched_entity *se)
 	return 1;
 }
 
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+	/*
+	 * If sched_entity still have not null load or utilization, we have to
+	 * decay it.
+	 */
+	if (se->avg.load_avg || se->avg.util_avg)
+		return false;
+
+	/*
+	 * If there is a pending propagation, we have to update the load and
+	 * the utilizaion of the sched_entity
+	 */
+	if (gcfs_rq->propagate_avg)
+		return false;
+
+	/*
+	 * Other wise, the load and the utilization of the sched_entity is
+	 * already null and there is no pending propagation so it will be a
+	 * waste of time to try to decay it.
+	 */
+	return true;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -6961,6 +6991,8 @@  static void update_blocked_averages(int cpu)
 	 * list_add_leaf_cfs_rq() for details.
 	 */
 	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct sched_entity *se;
+
 		/* throttled entities do not contribute to load */
 		if (throttled_hierarchy(cfs_rq))
 			continue;
@@ -6968,9 +7000,10 @@  static void update_blocked_averages(int cpu)
 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
 			update_tg_load_avg(cfs_rq, 0);
 
-		/* Propagate pending load changes to the parent */
-		if (cfs_rq->tg->se[cpu])
-			update_load_avg(cfs_rq->tg->se[cpu], 0);
+		/* Propagate pending load changes to the parent if any */
+		se = cfs_rq->tg->se[cpu];
+		if (se && !skip_blocked_update(se))
+			update_load_avg(se, 0);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }