
[v4,5/5] sched: ARM: create a dedicated scheduler topology table

Message ID 5357A802.1030804@arm.com
State New

Commit Message

Dietmar Eggemann April 23, 2014, 11:46 a.m. UTC
Hi,

I'm trying to use this approach of specifying different per-cpu views on
sd flags at DIE level on a TC2 platform (cluster 0 w/ CPU0/1 and cluster
1 w/ CPU2/3/4, w/o SMT). It doesn't work the way it does for the GMC/MC
sd level.

If I use the following patch on top (just to illustrate the issue;
returning SD_SHARE_POWERDOMAIN in cpu_cpupower_flags() is bogus, I
just needed a flag function for the GDIE level):

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 71e1fec6d31a..803330d89c09 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -194,6 +194,14 @@ const struct cpumask *cpu_corepower_mask(int cpu)
 	return &cpu_topology[cpu].thread_sibling;
 }
 
+const struct cpumask *cpu_cpupower_mask(int cpu)
+{
+	return cpu_topology[cpu].socket_id ?
+			cpumask_of_node(cpu_to_node(cpu)) :
+			&cpu_topology[cpu].core_sibling;
+}
+
+
 static void update_siblings_masks(unsigned int cpuid)
 {
 	struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -280,11 +288,18 @@ static inline const int cpu_corepower_flags(void)
 	return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
 }
 
+static inline const int cpu_cpupower_flags(void)
+{
+	return SD_SHARE_POWERDOMAIN;
+}
+
+
 static struct sched_domain_topology_level arm_topology[] = {
 #ifdef CONFIG_SCHED_MC
 	{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
+	{ cpu_cpupower_mask, cpu_cpupower_flags, SD_INIT_NAME(GDIE) },
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 	{ NULL, },
 };

so I get the following topology:

CPU0: cpu_corepower_mask=0   (GMC)
CPU0: cpu_coregroup_mask=0-1 (MC)
CPU0:  cpu_cpupower_mask=0-1 (GDIE)
CPU0:       cpu_cpu_mask=0-4 (DIE)
CPU1: cpu_corepower_mask=1    ...
CPU1: cpu_coregroup_mask=0-1
CPU1:  cpu_cpupower_mask=0-1
CPU1:       cpu_cpu_mask=0-4
CPU2: cpu_corepower_mask=2
CPU2: cpu_coregroup_mask=2-4
CPU2:  cpu_cpupower_mask=0-4
CPU2:       cpu_cpu_mask=0-4
CPU3: cpu_corepower_mask=3
CPU3: cpu_coregroup_mask=2-4
CPU3:  cpu_cpupower_mask=0-4
CPU3:       cpu_cpu_mask=0-4
CPU4: cpu_corepower_mask=4
CPU4: cpu_coregroup_mask=2-4
CPU4:  cpu_cpupower_mask=0-4
CPU4:       cpu_cpu_mask=0-4

Firstly, I had to get rid of the cpumask_equal(cpu_map,
sched_domain_span(sd)) condition in build_sched_domains() so that I can
have two sd levels which span CPUs 0-4 (for CPU2/3/4).
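
For reference, this is the check I mean, in the per-CPU domain build
loop of build_sched_domains() (paraphrased from memory, not a verbatim
copy of kernel/sched/core.c):

	for_each_cpu(i, cpu_map) {
		struct sched_domain_topology_level *tl;

		sd = NULL;
		for_each_sd_topology(tl) {
			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
			if (tl == sched_domain_topology)
				*per_cpu_ptr(d.sd, i) = sd;
			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
				sd->flags |= SD_OVERLAP;
			/* the condition I removed: stop stacking levels once
			 * one of them already spans all of cpu_map */
			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				break;
		}
	}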

But it still doesn't work correctly:

dmesg snippet 2:

CPU0 attaching sched-domain:
 domain 0: span 0-1 level MC
  groups: 0 1
  domain 1: span 0-4 level DIE     <-- error (there's only one group)
   groups: 0-4 (cpu_power = 2048)
...
CPU2 attaching sched-domain:
 domain 0: span 2-4 level MC
  groups: 2 3 4
  domain 1: span 0-4 level GDIE
ERROR: domain->groups does not contain CPU2
   groups:
ERROR: domain->cpu_power not set

ERROR: groups don't span domain->span
...

It turns out that get_group(), which is used a couple of times in
build_sched_groups(), relies on sd->child. Even though the degenerate
functionality gets rid of GDIE for CPU0/1 and of DIE for CPU2/3/4, the
group set-up doesn't work as expected, since sd->child of DIE is now
GDIE and not MC any more.
So it looks like the GMC/MC level is somehow special (GMC has no
sd->child on TC2, or is it that GMC/MC contains only one cpu per
group?).
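
For reference, get_group() picks the group for a domain via the first
CPU of sd->child's span, so the groups depend on which level was the
child at build time (paraphrased from memory, not a verbatim copy of
kernel/sched/core.c):

	static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
	{
		struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
		struct sched_domain *child = sd->child;

		/* the group is derived from the child level's span */
		if (child)
			cpu = cpumask_first(sched_domain_span(child));

		if (sg) {
			*sg = *per_cpu_ptr(sdd->sg, cpu);
			(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
		}

		return cpu;
	}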

Although this problem does not affect the current patch-set, people
might think that they can apply this degenerate trick to other sd
levels as well.

I'm trying to fix get_group()/build_sched_groups() in such a way that
my example would work, but so far I haven't succeeded. The question for
me remains: is this application of the degenerate trick feasible at all
sd levels, i.e. does it scale? What about platforms w/ an SMT sd level
which want to use this trick at GMC/MC level?

Any hints are highly appreciated here.

-- Dietmar

On 11/04/14 10:44, Vincent Guittot wrote:
> Create a dedicated topology table for ARM which will create new level to
> differentiate CPUs that can or not powergate independantly from others.
> 
> The patch gives an example of how to add domain that will take advantage of
> SD_SHARE_POWERDOMAIN.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  arch/arm/kernel/topology.c | 26 ++++++++++++++++++++++++++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
> index 0bc94b1..71e1fec 100644
> --- a/arch/arm/kernel/topology.c
> +++ b/arch/arm/kernel/topology.c
> @@ -185,6 +185,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
>  	return &cpu_topology[cpu].core_sibling;
>  }
>  
> +/*
> + * The current assumption is that we can power gate each core independently.
> + * This will be superseded by DT binding once available.
> + */
> +const struct cpumask *cpu_corepower_mask(int cpu)
> +{
> +	return &cpu_topology[cpu].thread_sibling;
> +}
> +
>  static void update_siblings_masks(unsigned int cpuid)
>  {
>  	struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
> @@ -266,6 +275,20 @@ void store_cpu_topology(unsigned int cpuid)
>  		cpu_topology[cpuid].socket_id, mpidr);
>  }
>  
> +static inline const int cpu_corepower_flags(void)
> +{
> +	return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
> +}
> +
> +static struct sched_domain_topology_level arm_topology[] = {
> +#ifdef CONFIG_SCHED_MC
> +	{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
> +	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
> +#endif
> +	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
> +	{ NULL, },
> +};
> +
>  /*
>   * init_cpu_topology is called at boot when only one cpu is running
>   * which prevent simultaneous write access to cpu_topology array
> @@ -289,4 +312,7 @@ void __init init_cpu_topology(void)
>  	smp_wmb();
>  
>  	parse_dt_topology();
> +
> +	/* Set scheduler topology descriptor */
> +	set_sched_topology(arm_topology);
>  }
> 



Comments

Vincent Guittot April 23, 2014, 2:46 p.m. UTC | #1
On 23 April 2014 13:46, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> Hi,
>
> I'm trying to use this approach of specifying different per-cpu views on
> sd flags on DIE level on a TC2 platform (cluster 0 w/ CPU0/1 and cluster
> 1 w/ CPU2/3/4 w/o SMT). It doesn't work like in the case for the GMC/MC
> sd level.
>
> If I use the following patch (just to illustrate the issue) on top
> (returning SD_SHARE_POWERDOMAIN in cpu_cpupower_flags() is bogus ... I
> just needed a flag function for GDIE level):
>
> diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
> index 71e1fec6d31a..803330d89c09 100644
> --- a/arch/arm/kernel/topology.c
> +++ b/arch/arm/kernel/topology.c
> @@ -194,6 +194,14 @@ const struct cpumask *cpu_corepower_mask(int cpu)
>         return &cpu_topology[cpu].thread_sibling;
>  }
>
> +const struct cpumask *cpu_cpupower_mask(int cpu)
> +{
> +       return cpu_topology[cpu].socket_id ?
> +                       cpumask_of_node(cpu_to_node(cpu)) :
> +                       &cpu_topology[cpu].core_sibling;
> +}
> +
> +
>  static void update_siblings_masks(unsigned int cpuid)
>  {
>         struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
> @@ -280,11 +288,18 @@ static inline const int cpu_corepower_flags(void)
>         return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
>  }
>
> +static inline const int cpu_cpupower_flags(void)
> +{
> +       return SD_SHARE_POWERDOMAIN;
> +}
> +
> +
>  static struct sched_domain_topology_level arm_topology[] = {
>  #ifdef CONFIG_SCHED_MC
>         { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
>         { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
>  #endif
> +       { cpu_cpupower_mask, cpu_cpupower_flags, SD_INIT_NAME(GDIE) },
>         { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>         { NULL, },
>  };
>
> so I get the following topology:
>
> CPU0: cpu_corepower_mask=0   (GMC)
> CPU0: cpu_coregroup_mask=0-1 (MC)
> CPU0:  cpu_cpupower_mask=0-1 (GDIE)
> CPU0:       cpu_cpu_mask=0-4 (DIE)
> CPU1: cpu_corepower_mask=1    ...
> CPU1: cpu_coregroup_mask=0-1
> CPU1:  cpu_cpupower_mask=0-1
> CPU1:       cpu_cpu_mask=0-4
> CPU2: cpu_corepower_mask=2
> CPU2: cpu_coregroup_mask=2-4
> CPU2:  cpu_cpupower_mask=0-4
> CPU2:       cpu_cpu_mask=0-4
> CPU3: cpu_corepower_mask=3
> CPU3: cpu_coregroup_mask=2-4
> CPU3:  cpu_cpupower_mask=0-4
> CPU3:       cpu_cpu_mask=0-4
> CPU4: cpu_corepower_mask=4
> CPU4: cpu_coregroup_mask=2-4
> CPU4:  cpu_cpupower_mask=0-4
> CPU4:       cpu_cpu_mask=0-4

You have an inconsistency in your topology description:

At GDIE level:
CPU1 cpu_cpupower_mask=0-1
but
CPU2 cpu_cpupower_mask=0-4

so CPU2 says that it shares a power domain with CPU0, but CPU1 says the opposite

Regards
Vincent

>
> Firstly, I had to get rid of the cpumask_equal(cpu_map,
> sched_domain_span(sd)) condition in build_sched_domains() to allow that
> I can have two sd levels which span CPU 0-4 (for CPU2/3/4).
>
> But it still doesn't work correctly:
>
> dmesg snippet 2:
>
> CPU0 attaching sched-domain:
>  domain 0: span 0-1 level MC
>   groups: 0 1
>   domain 1: span 0-4 level DIE     <-- error (there's only one group)
>    groups: 0-4 (cpu_power = 2048)
> ...
> CPU2 attaching sched-domain:
>  domain 0: span 2-4 level MC
>   groups: 2 3 4
>   domain 1: span 0-4 level GDIE
> ERROR: domain->groups does not contain CPU2
>    groups:
> ERROR: domain->cpu_power not set
>
> ERROR: groups don't span domain->span
> ...
>
> It turns out that the function get_group() which is used a couple of
> times in build_sched_groups() uses a reference to sd->child and even
> though the degenerate functionality gets rid of GDIE for CPU0/1 and DIE
> for CPU2/3/4 the group set-up doesn't work as expected since sd->child
> for DIE is GDIE and not MC any more.
> So it looks like GMC/MC level is somehow special (GMC has no sd->child
> for TC2 or GMC/MC contains only one cpu per group?).
>
> Although this problem does not effect the current patch-set, people
> might think that they can apply this degenerate trick for other sd
> levels as well.
>
> I'm trying to fix get_group()/build_sched_groups() in such a way that my
> example would work but so far I haven't succeeded. The question for me
> remains ... is this application of the degenerate trick feasible at all
> in all sd levels, i.e. does it scale? What about platforms w/ SMT sd
> level which want to use this trick in GMC/MC level?
>
> Any hints are highly appreciated here.
>
> -- Dietmar
>
> On 11/04/14 10:44, Vincent Guittot wrote:
>> Create a dedicated topology table for ARM which will create new level to
>> differentiate CPUs that can or not powergate independantly from others.
>>
>> The patch gives an example of how to add domain that will take advantage of
>> SD_SHARE_POWERDOMAIN.
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>  arch/arm/kernel/topology.c | 26 ++++++++++++++++++++++++++
>>  1 file changed, 26 insertions(+)
>>
>> diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
>> index 0bc94b1..71e1fec 100644
>> --- a/arch/arm/kernel/topology.c
>> +++ b/arch/arm/kernel/topology.c
>> @@ -185,6 +185,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
>>       return &cpu_topology[cpu].core_sibling;
>>  }
>>
>> +/*
>> + * The current assumption is that we can power gate each core independently.
>> + * This will be superseded by DT binding once available.
>> + */
>> +const struct cpumask *cpu_corepower_mask(int cpu)
>> +{
>> +     return &cpu_topology[cpu].thread_sibling;
>> +}
>> +
>>  static void update_siblings_masks(unsigned int cpuid)
>>  {
>>       struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
>> @@ -266,6 +275,20 @@ void store_cpu_topology(unsigned int cpuid)
>>               cpu_topology[cpuid].socket_id, mpidr);
>>  }
>>
>> +static inline const int cpu_corepower_flags(void)
>> +{
>> +     return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
>> +}
>> +
>> +static struct sched_domain_topology_level arm_topology[] = {
>> +#ifdef CONFIG_SCHED_MC
>> +     { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
>> +     { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
>> +#endif
>> +     { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>> +     { NULL, },
>> +};
>> +
>>  /*
>>   * init_cpu_topology is called at boot when only one cpu is running
>>   * which prevent simultaneous write access to cpu_topology array
>> @@ -289,4 +312,7 @@ void __init init_cpu_topology(void)
>>       smp_wmb();
>>
>>       parse_dt_topology();
>> +
>> +     /* Set scheduler topology descriptor */
>> +     set_sched_topology(arm_topology);
>>  }
>>
>
>
Dietmar Eggemann April 23, 2014, 3:26 p.m. UTC | #2
On 23/04/14 15:46, Vincent Guittot wrote:
> On 23 April 2014 13:46, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>> Hi,
>>
>> I'm trying to use this approach of specifying different per-cpu views on
>> sd flags on DIE level on a TC2 platform (cluster 0 w/ CPU0/1 and cluster
>> 1 w/ CPU2/3/4 w/o SMT). It doesn't work like in the case for the GMC/MC
>> sd level.
>>
>> If I use the following patch (just to illustrate the issue) on top
>> (returning SD_SHARE_POWERDOMAIN in cpu_cpupower_flags() is bogus ... I
>> just needed a flag function for GDIE level):
>>
>> diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
>> index 71e1fec6d31a..803330d89c09 100644
>> --- a/arch/arm/kernel/topology.c
>> +++ b/arch/arm/kernel/topology.c
>> @@ -194,6 +194,14 @@ const struct cpumask *cpu_corepower_mask(int cpu)
>>         return &cpu_topology[cpu].thread_sibling;
>>  }
>>
>> +const struct cpumask *cpu_cpupower_mask(int cpu)
>> +{
>> +       return cpu_topology[cpu].socket_id ?
>> +                       cpumask_of_node(cpu_to_node(cpu)) :
>> +                       &cpu_topology[cpu].core_sibling;
>> +}
>> +
>> +
>>  static void update_siblings_masks(unsigned int cpuid)
>>  {
>>         struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
>> @@ -280,11 +288,18 @@ static inline const int cpu_corepower_flags(void)
>>         return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
>>  }
>>
>> +static inline const int cpu_cpupower_flags(void)
>> +{
>> +       return SD_SHARE_POWERDOMAIN;
>> +}
>> +
>> +
>>  static struct sched_domain_topology_level arm_topology[] = {
>>  #ifdef CONFIG_SCHED_MC
>>         { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
>>         { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
>>  #endif
>> +       { cpu_cpupower_mask, cpu_cpupower_flags, SD_INIT_NAME(GDIE) },
>>         { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>>         { NULL, },
>>  };
>>
>> so I get the following topology:
>>
>> CPU0: cpu_corepower_mask=0   (GMC)
>> CPU0: cpu_coregroup_mask=0-1 (MC)
>> CPU0:  cpu_cpupower_mask=0-1 (GDIE)
>> CPU0:       cpu_cpu_mask=0-4 (DIE)
>> CPU1: cpu_corepower_mask=1    ...
>> CPU1: cpu_coregroup_mask=0-1
>> CPU1:  cpu_cpupower_mask=0-1
>> CPU1:       cpu_cpu_mask=0-4
>> CPU2: cpu_corepower_mask=2
>> CPU2: cpu_coregroup_mask=2-4
>> CPU2:  cpu_cpupower_mask=0-4
>> CPU2:       cpu_cpu_mask=0-4
>> CPU3: cpu_corepower_mask=3
>> CPU3: cpu_coregroup_mask=2-4
>> CPU3:  cpu_cpupower_mask=0-4
>> CPU3:       cpu_cpu_mask=0-4
>> CPU4: cpu_corepower_mask=4
>> CPU4: cpu_coregroup_mask=2-4
>> CPU4:  cpu_cpupower_mask=0-4
>> CPU4:       cpu_cpu_mask=0-4
> 
> You  have an inconsistency in your topology description:

That's true functionally, but I think that this is not the reason why
the code in get_group()/build_sched_groups() isn't working correctly any
more with this set-up.

Like I said above, cpu_cpupower_flags() is just bogus and I only added
it to illustrate the issue (I shouldn't have used SD_SHARE_POWERDOMAIN
in the first place).
Essentially what I want to do is bind an SD_SHARE_*FOO* flag to the
GDIE-related sd's of CPU2/3/4 and not to the DIE-related sd's of CPU0/1.

So far I thought that I can achieve that by getting rid of the GDIE sd
level for CPU0/1, simply by choosing the cpu_foo_mask() function in such
a way that it returns the same cpu mask as its child sd level (MC), and
of the DIE sd level for CPU2/3/4, because there it returns the same cpu
mask as the cpu mask function of its child sd level (GDIE). This lets sd
degenerate do its job of folding sd levels, which it does. The only
problem I have is that the groups are not created correctly any more.
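
The folding itself hinges on the span comparison in
sd_parent_degenerate(), called from cpu_attach_domain(); the relevant
bit is roughly (paraphrased, not a verbatim copy):

	/* a parent sd which spans exactly the same CPUs as its child and
	 * adds no interesting flags gets folded away */
	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

which is why returning the child level's cpu mask from the new level's
mask function makes that level disappear again on those CPUs.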

I don't see right now how the flag SD_SHARE_FOO affects the code in
get_group()/build_sched_groups().

Think of SD_SHARE_FOO as something I would like to have for all sd's of
the CPUs of cluster 1 (CPU2/3/4) and not of cluster 0 (CPU0/1), in the
sd level where each CPU sees two groups (group0 containing CPU0/1 and
group1 containing CPU2/3/4, or vice versa) (GDIE/DIE).

-- Dietmar

> 
> At GDIE level:
> CPU1 cpu_cpupower_mask=0-1
> but
> CPU2 cpu_cpupower_mask=0-4
> 
> so CPU2 says that it shares power domain with CPU0 but CPU1 says the opposite
> 
> Regards
> Vincent
> 
>>
>> Firstly, I had to get rid of the cpumask_equal(cpu_map,
>> sched_domain_span(sd)) condition in build_sched_domains() to allow that
>> I can have two sd levels which span CPU 0-4 (for CPU2/3/4).
>>
>> But it still doesn't work correctly:
>>
>> dmesg snippet 2:
>>
>> CPU0 attaching sched-domain:
>>  domain 0: span 0-1 level MC
>>   groups: 0 1
>>   domain 1: span 0-4 level DIE     <-- error (there's only one group)
>>    groups: 0-4 (cpu_power = 2048)
>> ...
>> CPU2 attaching sched-domain:
>>  domain 0: span 2-4 level MC
>>   groups: 2 3 4
>>   domain 1: span 0-4 level GDIE
>> ERROR: domain->groups does not contain CPU2
>>    groups:
>> ERROR: domain->cpu_power not set
>>
>> ERROR: groups don't span domain->span
>> ...
>>
>> It turns out that the function get_group() which is used a couple of
>> times in build_sched_groups() uses a reference to sd->child and even
>> though the degenerate functionality gets rid of GDIE for CPU0/1 and DIE
>> for CPU2/3/4 the group set-up doesn't work as expected since sd->child
>> for DIE is GDIE and not MC any more.
>> So it looks like GMC/MC level is somehow special (GMC has no sd->child
>> for TC2 or GMC/MC contains only one cpu per group?).
>>
>> Although this problem does not effect the current patch-set, people
>> might think that they can apply this degenerate trick for other sd
>> levels as well.
>>
>> I'm trying to fix get_group()/build_sched_groups() in such a way that my
>> example would work but so far I haven't succeeded. The question for me
>> remains ... is this application of the degenerate trick feasible at all
>> in all sd levels, i.e. does it scale? What about platforms w/ SMT sd
>> level which want to use this trick in GMC/MC level?
>>
>> Any hints are highly appreciated here.
>>
>> -- Dietmar
>>
>> On 11/04/14 10:44, Vincent Guittot wrote:
>>> Create a dedicated topology table for ARM which will create new level to
>>> differentiate CPUs that can or not powergate independantly from others.
>>>
>>> The patch gives an example of how to add domain that will take advantage of
>>> SD_SHARE_POWERDOMAIN.
>>>
>>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>>> ---
>>>  arch/arm/kernel/topology.c | 26 ++++++++++++++++++++++++++
>>>  1 file changed, 26 insertions(+)
>>>
>>> diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
>>> index 0bc94b1..71e1fec 100644
>>> --- a/arch/arm/kernel/topology.c
>>> +++ b/arch/arm/kernel/topology.c
>>> @@ -185,6 +185,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
>>>       return &cpu_topology[cpu].core_sibling;
>>>  }
>>>
>>> +/*
>>> + * The current assumption is that we can power gate each core independently.
>>> + * This will be superseded by DT binding once available.
>>> + */
>>> +const struct cpumask *cpu_corepower_mask(int cpu)
>>> +{
>>> +     return &cpu_topology[cpu].thread_sibling;
>>> +}
>>> +
>>>  static void update_siblings_masks(unsigned int cpuid)
>>>  {
>>>       struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
>>> @@ -266,6 +275,20 @@ void store_cpu_topology(unsigned int cpuid)
>>>               cpu_topology[cpuid].socket_id, mpidr);
>>>  }
>>>
>>> +static inline const int cpu_corepower_flags(void)
>>> +{
>>> +     return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
>>> +}
>>> +
>>> +static struct sched_domain_topology_level arm_topology[] = {
>>> +#ifdef CONFIG_SCHED_MC
>>> +     { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
>>> +     { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
>>> +#endif
>>> +     { cpu_cpu_mask, SD_INIT_NAME(DIE) },
>>> +     { NULL, },
>>> +};
>>> +
>>>  /*
>>>   * init_cpu_topology is called at boot when only one cpu is running
>>>   * which prevent simultaneous write access to cpu_topology array
>>> @@ -289,4 +312,7 @@ void __init init_cpu_topology(void)
>>>       smp_wmb();
>>>
>>>       parse_dt_topology();
>>> +
>>> +     /* Set scheduler topology descriptor */
>>> +     set_sched_topology(arm_topology);
>>>  }
>>>
>>
>>
> 


Vincent Guittot April 24, 2014, 7:30 a.m. UTC | #3
On 23 April 2014 17:26, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 23/04/14 15:46, Vincent Guittot wrote:
>> On 23 April 2014 13:46, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>>> Hi,
>>>

[snip]

>> You  have an inconsistency in your topology description:
>
> That's true functional-wise but I think that this is not the reason why
> the code in get_group()/build_sched_groups() isn't working correctly any
> more with this set-up.
>
> Like I said above the cpu_cpupower_flags() is just bogus and I only
> added it to illustrate the issue (Shouldn't have used
> SD_SHARE_POWERDOMAIN in the first place).

More than the flag that is used for the example, it's about the
cpumasks, which are inconsistent across CPUs for the same level; the
build_sched_domain sequence relies on this consistency to build the
sched_groups.

> Essentially what I want to do is bind an SD_SHARE_*FOO* flag to the GDIE
> related sd's of CPU2/3/4 and not to the DIE related sd's of CPU0/1.
>
> I thought so far that I can achieve that by getting rid of GDIE sd level
> for CPU0/1 simply by choosing the cpu_foo_mask() function in such a way
> that it returns the same cpu mask as its child sd level (MC) and of DIE
> sd level for CPU2/3/4 because it returns the same cpu mask as its child
> sd level (GDIE) related cpu mask function. This will let sd degenerate
> do it's job of folding sd levels which it does. The only problem I have
> is that the groups are not created correctly any more.
>
> I don't see right now how the flag SD_SHARE_FOO affects the code in
> get_group()/build_sched_groups().
>
> Think of SD_SHARE_FOO of something I would like to have for all sd's of
> CPU's of cluster 1 (CPU2/3/4) and not on cluster 0 (CPU0/1) in the sd
> level where each CPU sees two groups (group0 containing CPU0/1 and
> group1 containing CPU2/3/4 or vice versa) (GDIE/DIE) .

I'm not sure that it's feasible, because it's not possible from a
topology pov to have different flags if the span includes all cpus.
Could you give us more details about what you want to achieve with
this flag?

Vincent

>
> -- Dietmar
>
>>
>> At GDIE level:
>> CPU1 cpu_cpupower_mask=0-1
>> but
>> CPU2 cpu_cpupower_mask=0-4
>>
>> so CPU2 says that it shares power domain with CPU0 but CPU1 says the opposite
>>
>> Regards
>> Vincent
>>
>>>
>>> Firstly, I had to get rid of the cpumask_equal(cpu_map,
>>> sched_domain_span(sd)) condition in build_sched_domains() to allow that
>>> I can have two sd levels which span CPU 0-4 (for CPU2/3/4).
>>>
>>> But it still doesn't work correctly:
>>>
>>> dmesg snippet 2:
>>>
>>> CPU0 attaching sched-domain:
>>>  domain 0: span 0-1 level MC
>>>   groups: 0 1
>>>   domain 1: span 0-4 level DIE     <-- error (there's only one group)
>>>    groups: 0-4 (cpu_power = 2048)
>>> ...
>>> CPU2 attaching sched-domain:
>>>  domain 0: span 2-4 level MC
>>>   groups: 2 3 4
>>>   domain 1: span 0-4 level GDIE
>>> ERROR: domain->groups does not contain CPU2
>>>    groups:
>>> ERROR: domain->cpu_power not set
>>>
>>> ERROR: groups don't span domain->span
>>> ...
>>>

[snip]

>>>>
>>>
>>>
>>
>
>
Vincent Guittot April 25, 2014, 7:45 a.m. UTC | #4
On 24 April 2014 14:48, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 24/04/14 08:30, Vincent Guittot wrote:
>> On 23 April 2014 17:26, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>>> On 23/04/14 15:46, Vincent Guittot wrote:
>>>> On 23 April 2014 13:46, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>>>>> Hi,
>
> [...]
>
>>
>> More than the flag that is used for the example, it's about the
>> cpumask which are inconsistent across CPUs for the same level and the
>> build_sched_domain sequence rely on this consistency to build
>> sched_group
>
> Now I'm lost here. I thought so far that by specifying different cpu
> masks per CPU in an sd level, we get the sd level folding functionality
> in sd degenerate?
>
> We discussed this here for an example on TC2 for the GMC level:
> https://lkml.org/lkml/2014/3/21/126
>
> Back than I had
>   CPU0: cpu_corepower_mask=0-1
>   CPU2: cpu_corepower_mask=2
> so for GMC level the cpumasks are inconsistent across CPUs and it worked.

The example above is consistent because CPU2 mask and CPU0 mask are
fully exclusive

so
CPU0: cpu_corepower_mask=0-1
CPU2: cpu_corepower_mask=2
are consistent

CPU0: cpu_corepower_mask=0-2
CPU2: cpu_corepower_mask=0-2
are also consistent

but

CPU0: cpu_corepower_mask=0-1
CPU2: cpu_corepower_mask=0-2
are not consistent

and your example uses the last configuration
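
Put differently, two CPUs are consistent at a given level if their masks
are either equal or fully disjoint. A rough check could look like this
(just a sketch, not existing kernel code; it assumes the tl->mask
accessor introduced in patch 1/5 of this series):

	static bool tl_masks_consistent(struct sched_domain_topology_level *tl)
	{
		int i, j;

		for_each_possible_cpu(i) {
			for_each_possible_cpu(j) {
				const struct cpumask *mi = tl->mask(i);
				const struct cpumask *mj = tl->mask(j);

				/* equal or disjoint is fine, partial overlap is not */
				if (!cpumask_equal(mi, mj) && cpumask_intersects(mi, mj))
					return false;
			}
		}

		return true;
	}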

To be more precise, the rule above applies to the default SDT
definition, but the SD_OVERLAP flag enables this kind of overlap between
groups. Have you tried it?

Vincent

>
> The header of '[PATCH v4 1/5] sched: rework of sched_domain topology
> definition' mentions only the requirement "Then, each level must be a
> subset on the next one" and this one I haven't broken w/ my
> GMC/MC/GDIE/DIE set-up.
>
> Do I miss something else here?
>
>>
>>> Essentially what I want to do is bind an SD_SHARE_*FOO* flag to the GDIE
>>> related sd's of CPU2/3/4 and not to the DIE related sd's of CPU0/1.
>>>
>>> I thought so far that I can achieve that by getting rid of GDIE sd level
>>> for CPU0/1 simply by choosing the cpu_foo_mask() function in such a way
>>> that it returns the same cpu mask as its child sd level (MC) and of DIE
>>> sd level for CPU2/3/4 because it returns the same cpu mask as its child
>>> sd level (GDIE) related cpu mask function. This will let sd degenerate
>>> do it's job of folding sd levels which it does. The only problem I have
>>> is that the groups are not created correctly any more.
>>>
>>> I don't see right now how the flag SD_SHARE_FOO affects the code in
>>> get_group()/build_sched_groups().
>>>
>>> Think of SD_SHARE_FOO of something I would like to have for all sd's of
>>> CPU's of cluster 1 (CPU2/3/4) and not on cluster 0 (CPU0/1) in the sd
>>> level where each CPU sees two groups (group0 containing CPU0/1 and
>>> group1 containing CPU2/3/4 or vice versa) (GDIE/DIE) .
>>
>> I'm not sure that's it's feasible because it's not possible from a
>> topology pov to have different flags if the span include all cpus.
>> Could you give us more details about what you want to achieve with
>> this flag ?
>
> IMHO, the flag is not important for this discussion.  OTHA, information
> like you can't use sd degenerate functionality to fold adjacent sd
> levels (GFOO/FOO) on sd level which span all CPUs would be.  I want to
> make sure we understand what are the limitations to use folding of
> adjacent sd levels based on per-cpu differences in the return value of
> cpu_mask functions.
>
> -- Dietmar
>
> [...]
>
Dietmar Eggemann April 25, 2014, 3:55 p.m. UTC | #5
On 25/04/14 08:45, Vincent Guittot wrote:

[...]

>>
>> Back than I had
>>   CPU0: cpu_corepower_mask=0-1
>>   CPU2: cpu_corepower_mask=2
>> so for GMC level the cpumasks are inconsistent across CPUs and it worked.
> 
> The example above is consistent because CPU2 mask and CPU0 mask are
> fully exclusive

OK, got it now. The cpu mask functions on an sd level can either return
different (but then mutually exclusive) cpu masks, or they all return
the same cpu mask (like the DIE level in the example). Like you said, we
still have to respect the topology of the system.

This essentially excludes the DIE level (i.e. the sd level which spans
all CPUs) from playing this 'sd level folding via sd degenerate' game on
a system which sets FORCE_SD_OVERLAP to false or doesn't use the
SDTL_OVERLAP tl flag.

> 
> so
> CPU0: cpu_corepower_mask=0-1
> CPU2: cpu_corepower_mask=2
> are consistent
> 
> CPU0: cpu_corepower_mask=0-2
> CPU2: cpu_corepower_mask=0-2
> are also consistent
> 
> but
> 
> CPU0: cpu_corepower_mask=0-1
> CPU2: cpu_corepower_mask=0-2
> are not consistent
> 
> and your example uses the last configuration
> 
> To be more precise, the rule above applies on default SDT definition
> but the flag SD_OVERLAP enables such kind of overlap between group.
> Have you tried it ?

Setting FORCE_SD_OVERLAP indeed changes the scenario a bit (we're now
using build_overlap_sched_groups() instead of build_sched_groups()). It
looks better, but the groups for CPU0/1 at DIE level are wrong (to get
this far I still have to comment out the 'break if cpu_map is equal to
the sd span' check in build_sched_domains() though).

dmesg snippet:

 CPU0 attaching sched-domain:
  domain 0: span 0-1 level MC
   groups: 0 1
   domain 1: span 0-4 level DIE
    groups: 0-4 (cpu_power = 5120) 0-1 (cpu_power = 2048) <-- error !!!
 CPU1 attaching sched-domain:
  domain 0: span 0-1 level MC
   groups: 1 0
   domain 1: span 0-4 level DIE
    groups: 0-1 (cpu_power = 2048) 0-4 (cpu_power = 5120) <-- error !!!
 CPU2 attaching sched-domain:
  domain 0: span 2-4 level GMC
   groups: 2 3 4
   domain 1: span 0-4 level GDIE
    groups: 2-4 (cpu_power = 3072) 0-1 (cpu_power = 2048)
 ...

The feature I'm currently working on is to add sd energy information to
the sd levels of the sd topology level table. I essentially added
another column of sd-energy-related func ptrs (next to the flags-related
one) and wanted to use 'sd level folding via sd degenerate' at MC and
DIE level to have different sd energy information per CPU.
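
Just to illustrate what I mean by another column (purely hypothetical:
the sched_domain_energy_f typedef, the extra member it would need in
struct sched_domain_topology_level and the cpu_*_energy() functions
don't exist anywhere, neither in the kernel nor in this patch set):

	/* hypothetical: an extra per-level accessor next to sd_flags */
	typedef const struct sched_group_energy *(*sched_domain_energy_f)(int cpu);

	static struct sched_domain_topology_level arm_topology[] = {
	#ifdef CONFIG_SCHED_MC
		{ cpu_corepower_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(GMC) },
		{ cpu_coregroup_mask, cpu_core_flags, cpu_core_energy, SD_INIT_NAME(MC) },
	#endif
		{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
		{ NULL, },
	};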

In any case, this dependency on FORCE_SD_OVERLAP would be less than
nice for this feature though :-( A way out would be an 'int cpu'
parameter, but we already discussed this back then for the flag
function.

Thanks,

-- Dietmar

> 
> Vincent
> 

[...]

Peter Zijlstra April 25, 2014, 4:04 p.m. UTC | #6
> The example above is consistent because CPU2 mask and CPU0 mask are
> fully exclusive
> 
> so
> CPU0: cpu_corepower_mask=0-1
> CPU2: cpu_corepower_mask=2
> are consistent
> 
> CPU0: cpu_corepower_mask=0-2
> CPU2: cpu_corepower_mask=0-2
> are also consistent
> 
> but
> 
> CPU0: cpu_corepower_mask=0-1
> CPU2: cpu_corepower_mask=0-2
> are not consistent
> 
> and your example uses the last configuration
> 
> To be more precise, the rule above applies on default SDT definition
> but the flag SD_OVERLAP enables such kind of overlap between group.
> Have you tried it ?

I've never tried degenerate stuff with SD_OVERLAP, it might horribly
explode -- it's not actually meant to work.

The SD_OVERLAP comes from not fully connected NUMA topologies; suppose
something like:

        0------1
        |      |
        |      |
	2------3

or:

 ( 10 20 20  0 )
 ( 20 10  0 20 )
 ( 20  0 10 20 )
 (  0 20 20 10 )

Your domain level that models the single-hop/20 distance has overlapping
masks:

N0: 0-2
N1: 0,1,3
N2: 0,2,3
N3: 1-3

I've never tried to construct a NUMA topology that would be overlapping
and have redundant bits in.
Peter Zijlstra April 25, 2014, 4:05 p.m. UTC | #7
On Fri, Apr 25, 2014 at 06:04:19PM +0200, Peter Zijlstra wrote:
> > The example above is consistent because CPU2 mask and CPU0 mask are
> > fully exclusive
> > 
> > so
> > CPU0: cpu_corepower_mask=0-1
> > CPU2: cpu_corepower_mask=2
> > are consistent
> > 
> > CPU0: cpu_corepower_mask=0-2
> > CPU2: cpu_corepower_mask=0-2
> > are also consistent
> > 
> > but
> > 
> > CPU0: cpu_corepower_mask=0-1
> > CPU2: cpu_corepower_mask=0-2
> > are not consistent
> > 
> > and your example uses the last configuration
> > 
> > To be more precise, the rule above applies on default SDT definition
> > but the flag SD_OVERLAP enables such kind of overlap between group.
> > Have you tried it ?
> 
> I've never tried degenerate stuff with SD_OVERLAP, it might horribly
> explode -- its not actually meant to work.
> 
> The SD_OVERLAP comes from not fully connected NUMA topologies; suppose
> something like:
> 
>         0------1
>         |      |
>         |      |
>         2------3
> 
> or:
> 
>  ( 10 20 20  0 )
>  ( 20 10  0 20 )
>  ( 20  0 10 20 )
>  (  0 20 20 10 )

d'0h: s/0/30/

0 <-> 3 is 2 hops; I was too focused on the single-hop case

> Your domain level that models the single-hop/20 distance has overlapping
> masks:
> 
> N0: 0-2
> N1: 0,1,3
> N2: 0,2,3
> N3: 1-3
> 
> I've never tried to construct a NUMA topology that would be overlapping
> and have redundant bits in.

Patch

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 71e1fec6d31a..803330d89c09 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -194,6 +194,14 @@  const struct cpumask *cpu_corepower_mask(int cpu)
 	return &cpu_topology[cpu].thread_sibling;
 }

+const struct cpumask *cpu_cpupower_mask(int cpu)
+{
+	return cpu_topology[cpu].socket_id ?
+			cpumask_of_node(cpu_to_node(cpu)) :
+			&cpu_topology[cpu].core_sibling;
+}
+
+
 static void update_siblings_masks(unsigned int cpuid)
 {
 	struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -280,11 +288,18 @@  static inline const int cpu_corepower_flags(void)
 	return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
 }

+static inline const int cpu_cpupower_flags(void)
+{
+	return SD_SHARE_POWERDOMAIN;
+}
+
+
 static struct sched_domain_topology_level arm_topology[] = {
 #ifdef CONFIG_SCHED_MC
 	{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
+	{ cpu_cpupower_mask, cpu_cpupower_flags, SD_INIT_NAME(GDIE) },
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 	{ NULL, },
 };