diff mbox series

[RFC,v5,1/4] topology: Represent clusters of CPUs within a die

Message ID 20210319041618.14316-2-song.bao.hua@hisilicon.com
State Superseded
Headers show
Series scheduler: expose the topology of clusters and add cluster scheduler | expand

Commit Message

Song Bao Hua (Barry Song) March 19, 2021, 4:16 a.m. UTC
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>


Both ACPI and DT provide the ability to describe additional layers of
topology between that of individual cores and higher level constructs
such as the level at which the last level cache is shared.
In ACPI this can be represented in PPTT as a Processor Hierarchy
Node Structure [1] that is the parent of the CPU cores and in turn
has a parent Processor Hierarchy Node Structure representing
a higher level of topology.

For example, Kunpeng 920 has 6 or 8 clusters in each NUMA node, and each
cluster has 4 CPUs. All clusters share L3 cache data, but each cluster
has its own local L3 tag. On the other hand, the CPUs within a cluster
share some internal system bus.

+-----------------------------------+                          +---------+
|  +------+    +------+            +---------------------------+         |
|  | CPU0 |    | cpu1 |             |    +-----------+         |         |
|  +------+    +------+             |    |           |         |         |
|                                   +----+    L3     |         |         |
|  +------+    +------+   cluster   |    |    tag    |         |         |
|  | CPU2 |    | CPU3 |             |    |           |         |         |
|  +------+    +------+             |    +-----------+         |         |
|                                   |                          |         |
+-----------------------------------+                          |         |
+-----------------------------------+                          |         |
|  +------+    +------+             +--------------------------+         |
|  |      |    |      |             |    +-----------+         |         |
|  +------+    +------+             |    |           |         |         |
|                                   |    |    L3     |         |         |
|  +------+    +------+             +----+    tag    |         |         |
|  |      |    |      |             |    |           |         |         |
|  +------+    +------+             |    +-----------+         |         |
|                                   |                          |         |
+-----------------------------------+                          |   L3    |
                                                               |   data  |
+-----------------------------------+                          |         |
|  +------+    +------+             |    +-----------+         |         |
|  |      |    |      |             |    |           |         |         |
|  +------+    +------+             +----+    L3     |         |         |
|                                   |    |    tag    |         |         |
|  +------+    +------+             |    |           |         |         |
|  |      |    |      |            ++    +-----------+         |         |
|  +------+    +------+            |---------------------------+         |
+-----------------------------------|                          |         |
+-----------------------------------|                          |         |
|  +------+    +------+            +---------------------------+         |
|  |      |    |      |             |    +-----------+         |         |
|  +------+    +------+             |    |           |         |         |
|                                   +----+    L3     |         |         |
|  +------+    +------+             |    |    tag    |         |         |
|  |      |    |      |             |    |           |         |         |
|  +------+    +------+             |    +-----------+         |         |
|                                   |                          |         |
+-----------------------------------+                          |         |
+-----------------------------------+                          |         |
|  +------+    +------+             +--------------------------+         |
|  |      |    |      |             |   +-----------+          |         |
|  +------+    +------+             |   |           |          |         |
|                                   |   |    L3     |          |         |
|  +------+    +------+             +---+    tag    |          |         |
|  |      |    |      |             |   |           |          |         |
|  +------+    +------+             |   +-----------+          |         |
|                                   |                          |         |
+-----------------------------------+                          |         |
+-----------------------------------+                         ++         |
|  +------+    +------+             +--------------------------+         |
|  |      |    |      |             |  +-----------+           |         |
|  +------+    +------+             |  |           |           |         |
|                                   |  |    L3     |           |         |
|  +------+    +------+             +--+    tag    |           |         |
|  |      |    |      |             |  |           |           |         |
|  +------+    +------+             |  +-----------+           |         |
|                                   |                          +---------+
+-----------------------------------+

That means the cost to transfer ownership of a cacheline between CPUs
within a cluster is lower than between CPUs in different clusters on
the same die. Hence, it can make sense to tell the scheduler to use
the cache affinity of the cluster to make better decisions on thread
migration.

This patch simply exposes this information to userspace libraries
like hwloc by providing cluster_cpus and related sysfs attributes.
A PoC of hwloc support is at [2].
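
For example, once these attributes are in place, a userspace consumer can
read them directly. The following is a minimal illustrative sketch (not part
of this patch) that prints cpu0's cluster siblings, roughly the way a library
such as hwloc would consume the new attribute:

#include <stdio.h>

int main(void)
{
	/* sysfs attribute added by this patch; assumes cpu0 is present */
	const char *path =
		"/sys/devices/system/cpu/cpu0/topology/cluster_cpus_list";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0 cluster siblings: %s", buf);
	fclose(f);
	return 0;
}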

Note this patch only handles the ACPI case.

Special consideration is needed for SMT processors, where it is
necessary to move 2 levels up the hierarchy from the leaf nodes
(thus skipping the processor core level).

Currently the ID provided is the offset of the Processor
Hierarchy Node Structure within the PPTT.  Whilst this is unique,
it is not terribly elegant, so alternative suggestions are welcome.

Note that arm64 / ACPI does not provide any means of identifying
a die level in the topology, but that is unrelated to the cluster
level.

[1] ACPI Specification 6.3 - section 5.2.29.1 processor hierarchy node
    structure (Type 0)
[2] https://github.com/hisilicon/hwloc/tree/linux-cluster

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>

---
 Documentation/admin-guide/cputopology.rst | 26 +++++++++++--
 arch/arm64/kernel/topology.c              |  2 +
 drivers/acpi/pptt.c                       | 63 +++++++++++++++++++++++++++++++
 drivers/base/arch_topology.c              | 15 ++++++++
 drivers/base/topology.c                   | 10 +++++
 include/linux/acpi.h                      |  5 +++
 include/linux/arch_topology.h             |  5 +++
 include/linux/topology.h                  |  6 +++
 8 files changed, 128 insertions(+), 4 deletions(-)

-- 
1.8.3.1

Comments

Greg Kroah-Hartman March 19, 2021, 6:35 a.m. UTC | #1
On Fri, Mar 19, 2021 at 05:16:15PM +1300, Barry Song wrote:
> diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst

> index b90dafc..f9d3745 100644

> --- a/Documentation/admin-guide/cputopology.rst

> +++ b/Documentation/admin-guide/cputopology.rst

> @@ -24,6 +24,12 @@ core_id:

>  	identifier (rather than the kernel's).  The actual value is

>  	architecture and platform dependent.

>  

> +cluster_id:

> +

> +	the Cluster ID of cpuX.  Typically it is the hardware platform's

> +	identifier (rather than the kernel's).  The actual value is

> +	architecture and platform dependent.

> +

>  book_id:

>  

>  	the book ID of cpuX. Typically it is the hardware platform's

> @@ -56,6 +62,14 @@ package_cpus_list:

>  	human-readable list of CPUs sharing the same physical_package_id.

>  	(deprecated name: "core_siblings_list")

>  

> +cluster_cpus:

> +

> +	internal kernel map of CPUs within the same cluster.

> +

> +cluster_cpus_list:

> +

> +	human-readable list of CPUs within the same cluster.

> +

>  die_cpus:

>  

>  	internal kernel map of CPUs within the same die.


Why are these sysfs files in this file, and not in a Documentation/ABI/
file which can be correctly parsed and shown to userspace?

Any chance you can fix that up here as well?

Also note that "list" is not something that goes in sysfs, sysfs is "one
value per file", and a list is not "one value".  How do you prevent
overflowing the buffer of the sysfs file if you have a "list"?

thanks,

greg k-h
Song Bao Hua (Barry Song) March 19, 2021, 6:57 a.m. UTC | #2
> -----Original Message-----

> From: Greg KH [mailto:gregkh@linuxfoundation.org]

> Sent: Friday, March 19, 2021 7:35 PM

> To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> Cc: tim.c.chen@linux.intel.com; catalin.marinas@arm.com; will@kernel.org;

> rjw@rjwysocki.net; vincent.guittot@linaro.org; bp@alien8.de;

> tglx@linutronix.de; mingo@redhat.com; lenb@kernel.org; peterz@infradead.org;

> dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;

> mgorman@suse.de; msys.mizuma@gmail.com; valentin.schneider@arm.com; Jonathan

> Cameron <jonathan.cameron@huawei.com>; juri.lelli@redhat.com;

> mark.rutland@arm.com; sudeep.holla@arm.com; aubrey.li@linux.intel.com;

> linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org;

> linux-acpi@vger.kernel.org; x86@kernel.org; xuwei (O) <xuwei5@huawei.com>;

> Zengtao (B) <prime.zeng@hisilicon.com>; guodong.xu@linaro.org; yangyicong

> <yangyicong@huawei.com>; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;

> linuxarm@openeuler.org; hpa@zytor.com

> Subject: Re: [RFC PATCH v5 1/4] topology: Represent clusters of CPUs within

> a die

> 

> On Fri, Mar 19, 2021 at 05:16:15PM +1300, Barry Song wrote:

> > diff --git a/Documentation/admin-guide/cputopology.rst

> b/Documentation/admin-guide/cputopology.rst

> > index b90dafc..f9d3745 100644

> > --- a/Documentation/admin-guide/cputopology.rst

> > +++ b/Documentation/admin-guide/cputopology.rst

> > @@ -24,6 +24,12 @@ core_id:

> >  	identifier (rather than the kernel's).  The actual value is

> >  	architecture and platform dependent.

> >

> > +cluster_id:

> > +

> > +	the Cluster ID of cpuX.  Typically it is the hardware platform's

> > +	identifier (rather than the kernel's).  The actual value is

> > +	architecture and platform dependent.

> > +

> >  book_id:

> >

> >  	the book ID of cpuX. Typically it is the hardware platform's

> > @@ -56,6 +62,14 @@ package_cpus_list:

> >  	human-readable list of CPUs sharing the same physical_package_id.

> >  	(deprecated name: "core_siblings_list")

> >

> > +cluster_cpus:

> > +

> > +	internal kernel map of CPUs within the same cluster.

> > +

> > +cluster_cpus_list:

> > +

> > +	human-readable list of CPUs within the same cluster.

> > +

> >  die_cpus:

> >

> >  	internal kernel map of CPUs within the same die.

> 

> Why are these sysfs files in this file, and not in a Documentation/ABI/

> file which can be correctly parsed and shown to userspace?


Well, those ABIs have been there for a long time. It looks like this:

[root@ceph1 topology]# ls
core_id  core_siblings  core_siblings_list  physical_package_id thread_siblings  thread_siblings_list
[root@ceph1 topology]# pwd
/sys/devices/system/cpu/cpu100/topology
[root@ceph1 topology]# cat core_siblings_list
64-127
[root@ceph1 topology]#

> 

> Any chance you can fix that up here as well?


Yes. We will send a separate patch to address this, which won't
be in this patchset. This patchset will be based on that one.

> 

> Also note that "list" is not something that goes in sysfs, sysfs is "one

> value per file", and a list is not "one value".  How do you prevent

> overflowing the buffer of the sysfs file if you have a "list"?

> 


At a glance, the list is using a "-" range rather than a real list
[root@ceph1 topology]# cat core_siblings_list
64-127

Anyway, I will take a look at whether it has any chance to overflow.

> thanks,

> 

> greg k-h


Thanks
Barry
Jonathan Cameron March 19, 2021, 9:36 a.m. UTC | #3
On Fri, 19 Mar 2021 06:57:08 +0000
"Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:

> > -----Original Message-----

> > From: Greg KH [mailto:gregkh@linuxfoundation.org]

> > Sent: Friday, March 19, 2021 7:35 PM

> > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> > Cc: tim.c.chen@linux.intel.com; catalin.marinas@arm.com; will@kernel.org;

> > rjw@rjwysocki.net; vincent.guittot@linaro.org; bp@alien8.de;

> > tglx@linutronix.de; mingo@redhat.com; lenb@kernel.org; peterz@infradead.org;

> > dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;

> > mgorman@suse.de; msys.mizuma@gmail.com; valentin.schneider@arm.com; Jonathan

> > Cameron <jonathan.cameron@huawei.com>; juri.lelli@redhat.com;

> > mark.rutland@arm.com; sudeep.holla@arm.com; aubrey.li@linux.intel.com;

> > linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org;

> > linux-acpi@vger.kernel.org; x86@kernel.org; xuwei (O) <xuwei5@huawei.com>;

> > Zengtao (B) <prime.zeng@hisilicon.com>; guodong.xu@linaro.org; yangyicong

> > <yangyicong@huawei.com>; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;

> > linuxarm@openeuler.org; hpa@zytor.com

> > Subject: Re: [RFC PATCH v5 1/4] topology: Represent clusters of CPUs within

> > a die

> > 

> > On Fri, Mar 19, 2021 at 05:16:15PM +1300, Barry Song wrote:  

> > > diff --git a/Documentation/admin-guide/cputopology.rst  

> > b/Documentation/admin-guide/cputopology.rst  

> > > index b90dafc..f9d3745 100644

> > > --- a/Documentation/admin-guide/cputopology.rst

> > > +++ b/Documentation/admin-guide/cputopology.rst

> > > @@ -24,6 +24,12 @@ core_id:

> > >  	identifier (rather than the kernel's).  The actual value is

> > >  	architecture and platform dependent.

> > >

> > > +cluster_id:

> > > +

> > > +	the Cluster ID of cpuX.  Typically it is the hardware platform's

> > > +	identifier (rather than the kernel's).  The actual value is

> > > +	architecture and platform dependent.

> > > +

> > >  book_id:

> > >

> > >  	the book ID of cpuX. Typically it is the hardware platform's

> > > @@ -56,6 +62,14 @@ package_cpus_list:

> > >  	human-readable list of CPUs sharing the same physical_package_id.

> > >  	(deprecated name: "core_siblings_list")

> > >

> > > +cluster_cpus:

> > > +

> > > +	internal kernel map of CPUs within the same cluster.

> > > +

> > > +cluster_cpus_list:

> > > +

> > > +	human-readable list of CPUs within the same cluster.

> > > +

> > >  die_cpus:

> > >

> > >  	internal kernel map of CPUs within the same die.  

> > 

> > Why are these sysfs files in this file, and not in a Documentation/ABI/

> > file which can be correctly parsed and shown to userspace?  

> 

> Well. Those ABIs have been there for much a long time. It is like:

> 

> [root@ceph1 topology]# ls

> core_id  core_siblings  core_siblings_list  physical_package_id thread_siblings  thread_siblings_list

> [root@ceph1 topology]# pwd

> /sys/devices/system/cpu/cpu100/topology

> [root@ceph1 topology]# cat core_siblings_list

> 64-127

> [root@ceph1 topology]#

> 

> > 

> > Any chance you can fix that up here as well?  

> 

> Yes. we will send a separate patch to address this, which won't

> be in this patchset. This patchset will base on that one.

> 

> > 

> > Also note that "list" is not something that goes in sysfs, sysfs is "one

> > value per file", and a list is not "one value".  How do you prevent

> > overflowing the buffer of the sysfs file if you have a "list"?

> >   

> 

> At a glance, the list is using "-" rather than a real list

> [root@ceph1 topology]# cat core_siblings_list

> 64-127

> 

> Anyway, I will take a look if it has any chance to overflow.


It could in theory be alternate CPUs as a comma-separated list.
So it would get interesting around 500-1000 CPUs (guessing).

Hopefully no one has that crazy a CPU numbering scheme, but it's possible
(note that cluster is fine for this, but I guess it might eventually
happen for the core-siblings list, i.e. CPUs within a package).

Shouldn't crash or anything like that, but it might terminate early.
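
A rough worked estimate (assuming 4 KiB pages, 4-digit CPU numbers and the
pathological case of every second CPU present, e.g. "0,2,4,...,8190"): each
entry costs about 5 bytes, so a page holds on the order of 800 entries before
cpumap_print_to_pagebuf() runs out of room, squarely in that 500-1000 range.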

On sysfs file conversion, that got mentioned earlier but I forgot
to remind Barry about it when he took this patch into his series.
Sorry about that!

Jonathan


> 

> > thanks,

> > 

> > greg k-h  

> 

> Thanks

> Barry

>
Greg Kroah-Hartman March 19, 2021, 10:01 a.m. UTC | #4
On Fri, Mar 19, 2021 at 09:36:16AM +0000, Jonathan Cameron wrote:
> On Fri, 19 Mar 2021 06:57:08 +0000

> "Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:

> 

> > > -----Original Message-----

> > > From: Greg KH [mailto:gregkh@linuxfoundation.org]

> > > Sent: Friday, March 19, 2021 7:35 PM

> > > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> > > Cc: tim.c.chen@linux.intel.com; catalin.marinas@arm.com; will@kernel.org;

> > > rjw@rjwysocki.net; vincent.guittot@linaro.org; bp@alien8.de;

> > > tglx@linutronix.de; mingo@redhat.com; lenb@kernel.org; peterz@infradead.org;

> > > dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;

> > > mgorman@suse.de; msys.mizuma@gmail.com; valentin.schneider@arm.com; Jonathan

> > > Cameron <jonathan.cameron@huawei.com>; juri.lelli@redhat.com;

> > > mark.rutland@arm.com; sudeep.holla@arm.com; aubrey.li@linux.intel.com;

> > > linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org;

> > > linux-acpi@vger.kernel.org; x86@kernel.org; xuwei (O) <xuwei5@huawei.com>;

> > > Zengtao (B) <prime.zeng@hisilicon.com>; guodong.xu@linaro.org; yangyicong

> > > <yangyicong@huawei.com>; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;

> > > linuxarm@openeuler.org; hpa@zytor.com

> > > Subject: Re: [RFC PATCH v5 1/4] topology: Represent clusters of CPUs within

> > > a die

> > > 

> > > On Fri, Mar 19, 2021 at 05:16:15PM +1300, Barry Song wrote:  

> > > > diff --git a/Documentation/admin-guide/cputopology.rst  

> > > b/Documentation/admin-guide/cputopology.rst  

> > > > index b90dafc..f9d3745 100644

> > > > --- a/Documentation/admin-guide/cputopology.rst

> > > > +++ b/Documentation/admin-guide/cputopology.rst

> > > > @@ -24,6 +24,12 @@ core_id:

> > > >  	identifier (rather than the kernel's).  The actual value is

> > > >  	architecture and platform dependent.

> > > >

> > > > +cluster_id:

> > > > +

> > > > +	the Cluster ID of cpuX.  Typically it is the hardware platform's

> > > > +	identifier (rather than the kernel's).  The actual value is

> > > > +	architecture and platform dependent.

> > > > +

> > > >  book_id:

> > > >

> > > >  	the book ID of cpuX. Typically it is the hardware platform's

> > > > @@ -56,6 +62,14 @@ package_cpus_list:

> > > >  	human-readable list of CPUs sharing the same physical_package_id.

> > > >  	(deprecated name: "core_siblings_list")

> > > >

> > > > +cluster_cpus:

> > > > +

> > > > +	internal kernel map of CPUs within the same cluster.

> > > > +

> > > > +cluster_cpus_list:

> > > > +

> > > > +	human-readable list of CPUs within the same cluster.

> > > > +

> > > >  die_cpus:

> > > >

> > > >  	internal kernel map of CPUs within the same die.  

> > > 

> > > Why are these sysfs files in this file, and not in a Documentation/ABI/

> > > file which can be correctly parsed and shown to userspace?  

> > 

> > Well. Those ABIs have been there for much a long time. It is like:

> > 

> > [root@ceph1 topology]# ls

> > core_id  core_siblings  core_siblings_list  physical_package_id thread_siblings  thread_siblings_list

> > [root@ceph1 topology]# pwd

> > /sys/devices/system/cpu/cpu100/topology

> > [root@ceph1 topology]# cat core_siblings_list

> > 64-127

> > [root@ceph1 topology]#

> > 

> > > 

> > > Any chance you can fix that up here as well?  

> > 

> > Yes. we will send a separate patch to address this, which won't

> > be in this patchset. This patchset will base on that one.

> > 

> > > 

> > > Also note that "list" is not something that goes in sysfs, sysfs is "one

> > > value per file", and a list is not "one value".  How do you prevent

> > > overflowing the buffer of the sysfs file if you have a "list"?

> > >   

> > 

> > At a glance, the list is using "-" rather than a real list

> > [root@ceph1 topology]# cat core_siblings_list

> > 64-127

> > 

> > Anyway, I will take a look if it has any chance to overflow.

> 

> It could in theory be alternate CPUs as comma separated list.

> So it's would get interesting around 500-1000 cpus (guessing).

> 

> Hopefully no one has that crazy a cpu numbering scheme but it's possible

> (note that cluster is fine for this, but I guess it might eventually

> happen for core-siblings list (cpus within a package).

> 

> Shouldn't crash or anything like that but might terminate early.


We have a broken sysfs api already for listing LED numbers that has had
to be worked around in the past, please do not create a new one with
that same problem, we should learn from them :)

thanks,

greg k-h
Song Bao Hua (Barry Song) April 20, 2021, 3:30 a.m. UTC | #5
> -----Original Message-----

> From: Greg KH [mailto:gregkh@linuxfoundation.org]

> Sent: Friday, March 19, 2021 11:02 PM

> To: Jonathan Cameron <jonathan.cameron@huawei.com>

> Cc: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>;

> tim.c.chen@linux.intel.com; catalin.marinas@arm.com; will@kernel.org;

> rjw@rjwysocki.net; vincent.guittot@linaro.org; bp@alien8.de;

> tglx@linutronix.de; mingo@redhat.com; lenb@kernel.org; peterz@infradead.org;

> dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;

> mgorman@suse.de; msys.mizuma@gmail.com; valentin.schneider@arm.com;

> juri.lelli@redhat.com; mark.rutland@arm.com; sudeep.holla@arm.com;

> aubrey.li@linux.intel.com; linux-arm-kernel@lists.infradead.org;

> linux-kernel@vger.kernel.org; linux-acpi@vger.kernel.org; x86@kernel.org;

> xuwei (O) <xuwei5@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>;

> guodong.xu@linaro.org; yangyicong <yangyicong@huawei.com>; Liguozhu (Kenneth)

> <liguozhu@hisilicon.com>; linuxarm@openeuler.org; hpa@zytor.com

> Subject: Re: [RFC PATCH v5 1/4] topology: Represent clusters of CPUs within

> a die

> 

> On Fri, Mar 19, 2021 at 09:36:16AM +0000, Jonathan Cameron wrote:

> > On Fri, 19 Mar 2021 06:57:08 +0000

> > "Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:

> >

> > > > -----Original Message-----

> > > > From: Greg KH [mailto:gregkh@linuxfoundation.org]

> > > > Sent: Friday, March 19, 2021 7:35 PM

> > > > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> > > > Cc: tim.c.chen@linux.intel.com; catalin.marinas@arm.com;

> will@kernel.org;

> > > > rjw@rjwysocki.net; vincent.guittot@linaro.org; bp@alien8.de;

> > > > tglx@linutronix.de; mingo@redhat.com; lenb@kernel.org;

> peterz@infradead.org;

> > > > dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;

> > > > mgorman@suse.de; msys.mizuma@gmail.com; valentin.schneider@arm.com;

> Jonathan

> > > > Cameron <jonathan.cameron@huawei.com>; juri.lelli@redhat.com;

> > > > mark.rutland@arm.com; sudeep.holla@arm.com; aubrey.li@linux.intel.com;

> > > > linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org;

> > > > linux-acpi@vger.kernel.org; x86@kernel.org; xuwei (O)

> <xuwei5@huawei.com>;

> > > > Zengtao (B) <prime.zeng@hisilicon.com>; guodong.xu@linaro.org;

> yangyicong

> > > > <yangyicong@huawei.com>; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;

> > > > linuxarm@openeuler.org; hpa@zytor.com

> > > > Subject: Re: [RFC PATCH v5 1/4] topology: Represent clusters of CPUs within

> > > > a die

> > > >

> > > > On Fri, Mar 19, 2021 at 05:16:15PM +1300, Barry Song wrote:

> > > > > diff --git a/Documentation/admin-guide/cputopology.rst

> > > > b/Documentation/admin-guide/cputopology.rst

> > > > > index b90dafc..f9d3745 100644

> > > > > --- a/Documentation/admin-guide/cputopology.rst

> > > > > +++ b/Documentation/admin-guide/cputopology.rst

> > > > > @@ -24,6 +24,12 @@ core_id:

> > > > >  	identifier (rather than the kernel's).  The actual value is

> > > > >  	architecture and platform dependent.

> > > > >

> > > > > +cluster_id:

> > > > > +

> > > > > +	the Cluster ID of cpuX.  Typically it is the hardware platform's

> > > > > +	identifier (rather than the kernel's).  The actual value is

> > > > > +	architecture and platform dependent.

> > > > > +

> > > > >  book_id:

> > > > >

> > > > >  	the book ID of cpuX. Typically it is the hardware platform's

> > > > > @@ -56,6 +62,14 @@ package_cpus_list:

> > > > >  	human-readable list of CPUs sharing the same physical_package_id.

> > > > >  	(deprecated name: "core_siblings_list")

> > > > >

> > > > > +cluster_cpus:

> > > > > +

> > > > > +	internal kernel map of CPUs within the same cluster.

> > > > > +

> > > > > +cluster_cpus_list:

> > > > > +

> > > > > +	human-readable list of CPUs within the same cluster.

> > > > > +

> > > > >  die_cpus:

> > > > >

> > > > >  	internal kernel map of CPUs within the same die.

> > > >

> > > > Why are these sysfs files in this file, and not in a Documentation/ABI/

> > > > file which can be correctly parsed and shown to userspace?

> > >

> > > Well. Those ABIs have been there for much a long time. It is like:

> > >

> > > [root@ceph1 topology]# ls

> > > core_id  core_siblings  core_siblings_list  physical_package_id

> thread_siblings  thread_siblings_list

> > > [root@ceph1 topology]# pwd

> > > /sys/devices/system/cpu/cpu100/topology

> > > [root@ceph1 topology]# cat core_siblings_list

> > > 64-127

> > > [root@ceph1 topology]#

> > >

> > > >

> > > > Any chance you can fix that up here as well?

> > >

> > > Yes. we will send a separate patch to address this, which won't

> > > be in this patchset. This patchset will base on that one.

> > >

> > > >

> > > > Also note that "list" is not something that goes in sysfs, sysfs is "one

> > > > value per file", and a list is not "one value".  How do you prevent

> > > > overflowing the buffer of the sysfs file if you have a "list"?

> > > >

> > >

> > > At a glance, the list is using "-" rather than a real list

> > > [root@ceph1 topology]# cat core_siblings_list

> > > 64-127

> > >

> > > Anyway, I will take a look if it has any chance to overflow.

> >

> > It could in theory be alternate CPUs as comma separated list.

> > So it's would get interesting around 500-1000 cpus (guessing).

> >

> > Hopefully no one has that crazy a cpu numbering scheme but it's possible

> > (note that cluster is fine for this, but I guess it might eventually

> > happen for core-siblings list (cpus within a package).

> >

> > Shouldn't crash or anything like that but might terminate early.

> 

> We have a broken sysfs api already for listing LED numbers that has had

> to be worked around in the past, please do not create a new one with

> that same problem, we should learn from them :)


Another place I am seeing a cpu list is in the NUMA topology:
/sys/devices/system/node/nodeX/cpulist.

But the code has a BUILD_BUG_ON to guard the pagebuf:

static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf)
{
	ssize_t n;
	cpumask_var_t mask;
	struct node *node_dev = to_node(dev);

	/* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */
	BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return 0;

	cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
	n = cpumap_print_to_pagebuf(list, buf, mask);
	free_cpumask_var(mask);

	return n;
}
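
As a quick back-of-the-envelope check of that bound (assuming NR_CPUS=8192,
the current x86_64 maximum): the mask format needs 9 characters per 32 bits
("ffffffff,"), so 8192 / 32 * 9 = 2304 bytes, which still fits in the 4KB
page buffer.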

For the lists in the cpu topology, I haven't seen such a guard, though I
believe we need one. Or am I missing something?

> 

> thanks,

> 

> greg k-h


Thanks
Barry
Song Bao Hua (Barry Song) April 21, 2021, 4:06 a.m. UTC | #6
> -----Original Message-----

> From: Song Bao Hua (Barry Song)

> Sent: Tuesday, April 20, 2021 3:24 PM

> To: 'Greg KH' <gregkh@linuxfoundation.org>; Jonathan Cameron

> <jonathan.cameron@huawei.com>

> Cc: tim.c.chen@linux.intel.com; catalin.marinas@arm.com; will@kernel.org;

> rjw@rjwysocki.net; vincent.guittot@linaro.org; bp@alien8.de;

> tglx@linutronix.de; mingo@redhat.com; lenb@kernel.org; peterz@infradead.org;

> dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;

> mgorman@suse.de; msys.mizuma@gmail.com; valentin.schneider@arm.com;

> juri.lelli@redhat.com; mark.rutland@arm.com; sudeep.holla@arm.com;

> aubrey.li@linux.intel.com; linux-arm-kernel@lists.infradead.org;

> linux-kernel@vger.kernel.org; linux-acpi@vger.kernel.org; x86@kernel.org;

> xuwei (O) <xuwei5@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>;

> guodong.xu@linaro.org; yangyicong <yangyicong@huawei.com>; Liguozhu (Kenneth)

> <liguozhu@hisilicon.com>; linuxarm@openeuler.org; hpa@zytor.com; tiantao (H)

> <tiantao6@hisilicon.com>

> Subject: RE: [RFC PATCH v5 1/4] topology: Represent clusters of CPUs within

> a die

> 

> 

> 

> > -----Original Message-----

> > From: Greg KH [mailto:gregkh@linuxfoundation.org]

> > Sent: Friday, March 19, 2021 11:02 PM

> > To: Jonathan Cameron <jonathan.cameron@huawei.com>

> > Cc: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>;

> > tim.c.chen@linux.intel.com; catalin.marinas@arm.com; will@kernel.org;

> > rjw@rjwysocki.net; vincent.guittot@linaro.org; bp@alien8.de;

> > tglx@linutronix.de; mingo@redhat.com; lenb@kernel.org;

> peterz@infradead.org;

> > dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;

> > mgorman@suse.de; msys.mizuma@gmail.com; valentin.schneider@arm.com;

> > juri.lelli@redhat.com; mark.rutland@arm.com; sudeep.holla@arm.com;

> > aubrey.li@linux.intel.com; linux-arm-kernel@lists.infradead.org;

> > linux-kernel@vger.kernel.org; linux-acpi@vger.kernel.org; x86@kernel.org;

> > xuwei (O) <xuwei5@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>;

> > guodong.xu@linaro.org; yangyicong <yangyicong@huawei.com>; Liguozhu

> (Kenneth)

> > <liguozhu@hisilicon.com>; linuxarm@openeuler.org; hpa@zytor.com

> > Subject: Re: [RFC PATCH v5 1/4] topology: Represent clusters of CPUs within

> > a die

> >

> > On Fri, Mar 19, 2021 at 09:36:16AM +0000, Jonathan Cameron wrote:

> > > On Fri, 19 Mar 2021 06:57:08 +0000

> > > "Song Bao Hua (Barry Song)" <song.bao.hua@hisilicon.com> wrote:

> > >

> > > > > -----Original Message-----

> > > > > From: Greg KH [mailto:gregkh@linuxfoundation.org]

> > > > > Sent: Friday, March 19, 2021 7:35 PM

> > > > > To: Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>

> > > > > Cc: tim.c.chen@linux.intel.com; catalin.marinas@arm.com;

> > will@kernel.org;

> > > > > rjw@rjwysocki.net; vincent.guittot@linaro.org; bp@alien8.de;

> > > > > tglx@linutronix.de; mingo@redhat.com; lenb@kernel.org;

> > peterz@infradead.org;

> > > > > dietmar.eggemann@arm.com; rostedt@goodmis.org; bsegall@google.com;

> > > > > mgorman@suse.de; msys.mizuma@gmail.com; valentin.schneider@arm.com;

> > Jonathan

> > > > > Cameron <jonathan.cameron@huawei.com>; juri.lelli@redhat.com;

> > > > > mark.rutland@arm.com; sudeep.holla@arm.com; aubrey.li@linux.intel.com;

> > > > > linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org;

> > > > > linux-acpi@vger.kernel.org; x86@kernel.org; xuwei (O)

> > <xuwei5@huawei.com>;

> > > > > Zengtao (B) <prime.zeng@hisilicon.com>; guodong.xu@linaro.org;

> > yangyicong

> > > > > <yangyicong@huawei.com>; Liguozhu (Kenneth) <liguozhu@hisilicon.com>;

> > > > > linuxarm@openeuler.org; hpa@zytor.com

> > > > > Subject: Re: [RFC PATCH v5 1/4] topology: Represent clusters of CPUs

> within

> > > > > a die

> > > > >

> > > > > On Fri, Mar 19, 2021 at 05:16:15PM +1300, Barry Song wrote:

> > > > > > diff --git a/Documentation/admin-guide/cputopology.rst

> > > > > b/Documentation/admin-guide/cputopology.rst

> > > > > > index b90dafc..f9d3745 100644

> > > > > > --- a/Documentation/admin-guide/cputopology.rst

> > > > > > +++ b/Documentation/admin-guide/cputopology.rst

> > > > > > @@ -24,6 +24,12 @@ core_id:

> > > > > >  	identifier (rather than the kernel's).  The actual value is

> > > > > >  	architecture and platform dependent.

> > > > > >

> > > > > > +cluster_id:

> > > > > > +

> > > > > > +	the Cluster ID of cpuX.  Typically it is the hardware platform's

> > > > > > +	identifier (rather than the kernel's).  The actual value is

> > > > > > +	architecture and platform dependent.

> > > > > > +

> > > > > >  book_id:

> > > > > >

> > > > > >  	the book ID of cpuX. Typically it is the hardware platform's

> > > > > > @@ -56,6 +62,14 @@ package_cpus_list:

> > > > > >  	human-readable list of CPUs sharing the same physical_package_id.

> > > > > >  	(deprecated name: "core_siblings_list")

> > > > > >

> > > > > > +cluster_cpus:

> > > > > > +

> > > > > > +	internal kernel map of CPUs within the same cluster.

> > > > > > +

> > > > > > +cluster_cpus_list:

> > > > > > +

> > > > > > +	human-readable list of CPUs within the same cluster.

> > > > > > +

> > > > > >  die_cpus:

> > > > > >

> > > > > >  	internal kernel map of CPUs within the same die.

> > > > >

> > > > > Why are these sysfs files in this file, and not in a Documentation/ABI/

> > > > > file which can be correctly parsed and shown to userspace?

> > > >

> > > > Well. Those ABIs have been there for much a long time. It is like:

> > > >

> > > > [root@ceph1 topology]# ls

> > > > core_id  core_siblings  core_siblings_list  physical_package_id

> > thread_siblings  thread_siblings_list

> > > > [root@ceph1 topology]# pwd

> > > > /sys/devices/system/cpu/cpu100/topology

> > > > [root@ceph1 topology]# cat core_siblings_list

> > > > 64-127

> > > > [root@ceph1 topology]#

> > > >

> > > > >

> > > > > Any chance you can fix that up here as well?

> > > >

> > > > Yes. we will send a separate patch to address this, which won't

> > > > be in this patchset. This patchset will base on that one.

> > > >

> > > > >

> > > > > Also note that "list" is not something that goes in sysfs, sysfs is

> "one

> > > > > value per file", and a list is not "one value".  How do you prevent

> > > > > overflowing the buffer of the sysfs file if you have a "list"?

> > > > >

> > > >

> > > > At a glance, the list is using "-" rather than a real list

> > > > [root@ceph1 topology]# cat core_siblings_list

> > > > 64-127

> > > >

> > > > Anyway, I will take a look if it has any chance to overflow.

> > >

> > > It could in theory be alternate CPUs as comma separated list.

> > > So it's would get interesting around 500-1000 cpus (guessing).

> > >

> > > Hopefully no one has that crazy a cpu numbering scheme but it's possible

> > > (note that cluster is fine for this, but I guess it might eventually

> > > happen for core-siblings list (cpus within a package).

> > >

> > > Shouldn't crash or anything like that but might terminate early.

> >

> > We have a broken sysfs api already for listing LED numbers that has had

> > to be worked around in the past, please do not create a new one with

> > that same problem, we should learn from them :)

> 

> Another place I am seeing a cpu list is in numa topology:

> /sys/devices/system/node/nodex/cpulist.

> 

> But the code has a BUILD_BUG_ON to guard the pagebuf:

> 

> static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf)

> {

> 	ssize_t n;

> 	cpumask_var_t mask;

> 	struct node *node_dev = to_node(dev);

> 

> 	/* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */

> 	BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));

> 

> 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))

> 		return 0;

> 

> 	cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);

> 	n = cpumap_print_to_pagebuf(list, buf, mask);

> 	free_cpumask_var(mask);

> 

> 	return n;

> }

> 

> For lists in cpu topology, I haven't seen this while I believe we need it.

> Or am I missing something?


I would prefer to send two patches as a series,
"clarify and clean up CPU and NUMA topology ABIs", with a cover
letter and the below one as 1/2. 2/2 would be the patch moving
the cpu topology ABI documentation to Documentation/ABI/.

From b32c0c00a187d4fe4c49d54d30650b0cacb2c351 Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>

Date: Wed, 21 Apr 2021 14:36:11 +1200
Subject: [PATCH 1/2] CPU, NUMA topology ABIs: clarify the overflow issue of sysfs
 pagebuf

Both NUMA nodes and CPUs use a cpu bitmap like 3,ffffffff to expose hardware
topology. When the CPU count is large, the sysfs page buffer could over-
flow. This doesn't really happen nowadays, as the maximum NR_CPUS is 8192
for X86_64 and 4096 for ARM64, and 8192 * 9 / 32 = 2304 is still smaller
than the 4KB page size.
So the existing BUILD_BUG_ON() in drivers/base/node.c is essentially
guarding against future problems, similar to Y2K, when hardware gets more
and more CPUs.
On the other hand, it is more sensible to move the guard to common
code, which can protect both the cpu and numa ABIs:
/sys/devices/system/cpu/cpu0/topology/die_cpus etc.
/sys/devices/system/node/node0/cpumap etc.

Topology bitmap mask strings shouldn't be larger than PAGE_SIZE, as
lstopo and numactl depend on them. But the other ABIs exposing cpu lists
are not really used by common applications, so this patch also documents
that those lists may be trimmed, as there is no guarantee they are
always smaller than PAGE_SIZE, especially when a list looks like:
0, 3, 5, 7, 9, 11... etc.

Signed-off-by: Tian Tao <tiantao6@hisilicon.com>

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>

---
 Documentation/ABI/stable/sysfs-devices-node |  5 ++++-
 Documentation/admin-guide/cputopology.rst   | 15 +++++++++++++++
 drivers/base/node.c                         |  3 ---
 include/linux/cpumask.h                     |  6 ++++++
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 484fc04bcc25..9832a17b2b15 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -47,7 +47,10 @@ What:		/sys/devices/system/node/nodeX/cpulist
 Date:		October 2002
 Contact:	Linux Memory Management list <linux-mm@kvack.org>
 Description:
-		The CPUs associated to the node.
+		The CPUs associated with the node. The format is like 0-3,
+		8-11, 12-13. The maximum size is PAGE_SIZE, so the tail
+		of the string will be trimmed if it would be larger
+		than PAGE_SIZE.
 
 What:		/sys/devices/system/node/nodeX/meminfo
 Date:		October 2002
diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst
index b90dafcc8237..8fac776a5ffa 100644
--- a/Documentation/admin-guide/cputopology.rst
+++ b/Documentation/admin-guide/cputopology.rst
@@ -44,6 +44,9 @@ core_cpus:
 core_cpus_list:
 
 	human-readable list of CPUs within the same core.
+	The format is like 0-3, 8-11, 12-13. The maximum size is PAGE_SIZE,
+	so the tail of the string will be trimmed if it would be larger
+	than PAGE_SIZE.
 	(deprecated name: "thread_siblings_list");
 
 package_cpus:
@@ -54,6 +57,9 @@ package_cpus:
 package_cpus_list:
 
 	human-readable list of CPUs sharing the same physical_package_id.
+	The format is like 0-3, 8-11, 12-13. The maximum size is PAGE_SIZE,
+	so the tail of the string will be trimmed if it would be larger
+	than PAGE_SIZE.
 	(deprecated name: "core_siblings_list")
 
 die_cpus:
@@ -63,6 +69,9 @@ die_cpus:
 die_cpus_list:
 
 	human-readable list of CPUs within the same die.
+	The format is like 0-3, 8-11, 12-13. The maximum size is PAGE_SIZE,
+	so the tail of the string will be trimmed if it would be larger
+	than PAGE_SIZE.
 
 book_siblings:
 
@@ -73,6 +82,9 @@ book_siblings_list:
 
 	human-readable list of cpuX's hardware threads within the same
 	book_id.
+	The format is like 0-3, 8-11, 12-13. The maximum size is PAGE_SIZE,
+	so the tail of the string will be trimmed if it would be larger
+	than PAGE_SIZE.
 
 drawer_siblings:
 
@@ -83,6 +95,9 @@ drawer_siblings_list:
 
 	human-readable list of cpuX's hardware threads within the same
 	drawer_id.
+	The format is like 0-3, 8-11, 12-13. The maximum size is PAGE_SIZE,
+	so the tail of the string will be trimmed if it would be larger
+	than PAGE_SIZE.
 
 Architecture-neutral, drivers/base/topology.c, exports these attributes.
 However, the book and drawer related sysfs files will only be created if
diff --git a/drivers/base/node.c b/drivers/base/node.c
index f449dbb2c746..50324d06bcd5 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -33,9 +33,6 @@ static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf)
 	cpumask_var_t mask;
 	struct node *node_dev = to_node(dev);
 
-	/* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */
-	BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
-
 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 		return 0;
 
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 383684e30f12..81f145e0c742 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -12,6 +12,7 @@
 #include <linux/bitmap.h>
 #include <linux/atomic.h>
 #include <linux/bug.h>
+#include <asm/page.h>
 
 /* Don't assign or return these: may not be this big! */
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
@@ -924,6 +925,11 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 static inline ssize_t
 cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
 {
+	/*
+	 * Each 32 bits of the mask needs up to 9 bytes ("ffffffff,"), so
+	 * too many CPUs would overflow the sysfs page buffer.
+	 */
+	BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
 	return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask),
 				      nr_cpu_ids);
 }
-- 
2.25.1

Thanks
Barry
diff mbox series

Patch

diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst
index b90dafc..f9d3745 100644
--- a/Documentation/admin-guide/cputopology.rst
+++ b/Documentation/admin-guide/cputopology.rst
@@ -24,6 +24,12 @@  core_id:
 	identifier (rather than the kernel's).  The actual value is
 	architecture and platform dependent.
 
+cluster_id:
+
+	the Cluster ID of cpuX.  Typically it is the hardware platform's
+	identifier (rather than the kernel's).  The actual value is
+	architecture and platform dependent.
+
 book_id:
 
 	the book ID of cpuX. Typically it is the hardware platform's
@@ -56,6 +62,14 @@  package_cpus_list:
 	human-readable list of CPUs sharing the same physical_package_id.
 	(deprecated name: "core_siblings_list")
 
+cluster_cpus:
+
+	internal kernel map of CPUs within the same cluster.
+
+cluster_cpus_list:
+
+	human-readable list of CPUs within the same cluster.
+
 die_cpus:
 
 	internal kernel map of CPUs within the same die.
@@ -96,11 +110,13 @@  these macros in include/asm-XXX/topology.h::
 
 	#define topology_physical_package_id(cpu)
 	#define topology_die_id(cpu)
+	#define topology_cluster_id(cpu)
 	#define topology_core_id(cpu)
 	#define topology_book_id(cpu)
 	#define topology_drawer_id(cpu)
 	#define topology_sibling_cpumask(cpu)
 	#define topology_core_cpumask(cpu)
+	#define topology_cluster_cpumask(cpu)
 	#define topology_die_cpumask(cpu)
 	#define topology_book_cpumask(cpu)
 	#define topology_drawer_cpumask(cpu)
@@ -116,10 +132,12 @@  not defined by include/asm-XXX/topology.h:
 
 1) topology_physical_package_id: -1
 2) topology_die_id: -1
-3) topology_core_id: 0
-4) topology_sibling_cpumask: just the given CPU
-5) topology_core_cpumask: just the given CPU
-6) topology_die_cpumask: just the given CPU
+3) topology_cluster_id: -1
+4) topology_core_id: 0
+5) topology_sibling_cpumask: just the given CPU
+6) topology_core_cpumask: just the given CPU
+7) topology_cluster_cpumask: just the given CPU
+8) topology_die_cpumask: just the given CPU
 
 For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
 default definitions for topology_book_id() and topology_book_cpumask().
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index e08a412..d72eb8d 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -103,6 +103,8 @@  int __init parse_acpi_topology(void)
 			cpu_topology[cpu].thread_id  = -1;
 			cpu_topology[cpu].core_id    = topology_id;
 		}
+		topology_id = find_acpi_cpu_topology_cluster(cpu);
+		cpu_topology[cpu].cluster_id = topology_id;
 		topology_id = find_acpi_cpu_topology_package(cpu);
 		cpu_topology[cpu].package_id = topology_id;
 
diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index 4ae9335..11f8b02 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -737,6 +737,69 @@  int find_acpi_cpu_topology_package(unsigned int cpu)
 }
 
 /**
+ * find_acpi_cpu_topology_cluster() - Determine a unique CPU cluster value
+ * @cpu: Kernel logical CPU number
+ *
+ * Determine a topology unique cluster ID for the given CPU/thread.
+ * This ID can then be used to group peers, which will have matching ids.
+ *
+ * The cluster, if present, is the level of topology above CPUs. In a
+ * multi-thread CPU, it will be the level above the CPU, not the thread.
+ * It may not exist in single CPU systems. In simple multi-CPU systems,
+ * it may be equal to the package topology level.
+ *
+ * Return: -ENOENT if the PPTT doesn't exist, the CPU cannot be found
+ * or there is no topology level above the CPU.
+ * Otherwise returns a value which represents the cluster for this CPU.
+ */
+
+int find_acpi_cpu_topology_cluster(unsigned int cpu)
+{
+	struct acpi_table_header *table;
+	acpi_status status;
+	struct acpi_pptt_processor *cpu_node, *cluster_node;
+	u32 acpi_cpu_id;
+	int retval;
+	int is_thread;
+
+	status = acpi_get_table(ACPI_SIG_PPTT, 0, &table);
+	if (ACPI_FAILURE(status)) {
+		acpi_pptt_warn_missing();
+		return -ENOENT;
+	}
+
+	acpi_cpu_id = get_acpi_id_for_cpu(cpu);
+	cpu_node = acpi_find_processor_node(table, acpi_cpu_id);
+	if (cpu_node == NULL || !cpu_node->parent) {
+		retval = -ENOENT;
+		goto put_table;
+	}
+
+	is_thread = cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_IS_THREAD;
+	cluster_node = fetch_pptt_node(table, cpu_node->parent);
+	if (cluster_node == NULL) {
+		retval = -ENOENT;
+		goto put_table;
+	}
+	if (is_thread) {
+		if (!cluster_node->parent) {
+			retval = -ENOENT;
+			goto put_table;
+		}
+		cluster_node = fetch_pptt_node(table, cluster_node->parent);
+		if (cluster_node == NULL) {
+			retval = -ENOENT;
+			goto put_table;
+		}
+	}
+	retval = ACPI_PTR_DIFF(cluster_node, table);
+put_table:
+	acpi_put_table(table);
+
+	return retval;
+}
+
+/**
  * find_acpi_cpu_topology_hetero_id() - Get a core architecture tag
  * @cpu: Kernel logical CPU number
  *
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index de8587c..ca3b8c1 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -506,6 +506,11 @@  const struct cpumask *cpu_coregroup_mask(int cpu)
 	return core_mask;
 }
 
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+	return &cpu_topology[cpu].cluster_sibling;
+}
+
 void update_siblings_masks(unsigned int cpuid)
 {
 	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -523,6 +528,11 @@  void update_siblings_masks(unsigned int cpuid)
 		if (cpuid_topo->package_id != cpu_topo->package_id)
 			continue;
 
+		if (cpuid_topo->cluster_id == cpu_topo->cluster_id) {
+			cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling);
+			cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling);
+		}
+
 		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
 		cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
 
@@ -541,6 +551,9 @@  static void clear_cpu_topology(int cpu)
 	cpumask_clear(&cpu_topo->llc_sibling);
 	cpumask_set_cpu(cpu, &cpu_topo->llc_sibling);
 
+	cpumask_clear(&cpu_topo->cluster_sibling);
+	cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling);
+
 	cpumask_clear(&cpu_topo->core_sibling);
 	cpumask_set_cpu(cpu, &cpu_topo->core_sibling);
 	cpumask_clear(&cpu_topo->thread_sibling);
@@ -556,6 +569,7 @@  void __init reset_cpu_topology(void)
 
 		cpu_topo->thread_id = -1;
 		cpu_topo->core_id = -1;
+		cpu_topo->cluster_id = -1;
 		cpu_topo->package_id = -1;
 		cpu_topo->llc_id = -1;
 
@@ -571,6 +585,7 @@  void remove_cpu_topology(unsigned int cpu)
 		cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
 	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+
 	for_each_cpu(sibling, topology_llc_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_llc_cpumask(sibling));
 
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 4d254fc..7157ac0 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -46,6 +46,9 @@ 
 define_id_show_func(die_id);
 static DEVICE_ATTR_RO(die_id);
 
+define_id_show_func(cluster_id);
+static DEVICE_ATTR_RO(cluster_id);
+
 define_id_show_func(core_id);
 static DEVICE_ATTR_RO(core_id);
 
@@ -61,6 +64,10 @@ 
 static DEVICE_ATTR_RO(core_siblings);
 static DEVICE_ATTR_RO(core_siblings_list);
 
+define_siblings_show_func(cluster_cpus, cluster_cpumask);
+static DEVICE_ATTR_RO(cluster_cpus);
+static DEVICE_ATTR_RO(cluster_cpus_list);
+
 define_siblings_show_func(die_cpus, die_cpumask);
 static DEVICE_ATTR_RO(die_cpus);
 static DEVICE_ATTR_RO(die_cpus_list);
@@ -88,6 +95,7 @@ 
 static struct attribute *default_attrs[] = {
 	&dev_attr_physical_package_id.attr,
 	&dev_attr_die_id.attr,
+	&dev_attr_cluster_id.attr,
 	&dev_attr_core_id.attr,
 	&dev_attr_thread_siblings.attr,
 	&dev_attr_thread_siblings_list.attr,
@@ -95,6 +103,8 @@ 
 	&dev_attr_core_cpus_list.attr,
 	&dev_attr_core_siblings.attr,
 	&dev_attr_core_siblings_list.attr,
+	&dev_attr_cluster_cpus.attr,
+	&dev_attr_cluster_cpus_list.attr,
 	&dev_attr_die_cpus.attr,
 	&dev_attr_die_cpus_list.attr,
 	&dev_attr_package_cpus.attr,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9f43241..138b779 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1307,6 +1307,7 @@  static inline int lpit_read_residency_count_address(u64 *address)
 #ifdef CONFIG_ACPI_PPTT
 int acpi_pptt_cpu_is_thread(unsigned int cpu);
 int find_acpi_cpu_topology(unsigned int cpu, int level);
+int find_acpi_cpu_topology_cluster(unsigned int cpu);
 int find_acpi_cpu_topology_package(unsigned int cpu);
 int find_acpi_cpu_topology_hetero_id(unsigned int cpu);
 int find_acpi_cpu_cache_topology(unsigned int cpu, int level);
@@ -1319,6 +1320,10 @@  static inline int find_acpi_cpu_topology(unsigned int cpu, int level)
 {
 	return -EINVAL;
 }
+static inline int find_acpi_cpu_topology_cluster(unsigned int cpu)
+{
+	return -EINVAL;
+}
 static inline int find_acpi_cpu_topology_package(unsigned int cpu)
 {
 	return -EINVAL;
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 0f6cd6b..987c7ea 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -49,10 +49,12 @@  void topology_set_thermal_pressure(const struct cpumask *cpus,
 struct cpu_topology {
 	int thread_id;
 	int core_id;
+	int cluster_id;
 	int package_id;
 	int llc_id;
 	cpumask_t thread_sibling;
 	cpumask_t core_sibling;
+	cpumask_t cluster_sibling;
 	cpumask_t llc_sibling;
 };
 
@@ -60,13 +62,16 @@  struct cpu_topology {
 extern struct cpu_topology cpu_topology[NR_CPUS];
 
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].package_id)
+#define topology_cluster_id(cpu)	(cpu_topology[cpu].cluster_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
 #define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_cluster_cpumask(cpu)	(&cpu_topology[cpu].cluster_sibling)
 #define topology_llc_cpumask(cpu)	(&cpu_topology[cpu].llc_sibling)
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
+const struct cpumask *cpu_clustergroup_mask(int cpu);
 void update_siblings_masks(unsigned int cpu);
 void remove_cpu_topology(unsigned int cpuid);
 void reset_cpu_topology(void);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7634cd7..80d27d7 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -186,6 +186,9 @@  static inline int cpu_to_mem(int cpu)
 #ifndef topology_die_id
 #define topology_die_id(cpu)			((void)(cpu), -1)
 #endif
+#ifndef topology_cluster_id
+#define topology_cluster_id(cpu)		((void)(cpu), -1)
+#endif
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
@@ -195,6 +198,9 @@  static inline int cpu_to_mem(int cpu)
 #ifndef topology_core_cpumask
 #define topology_core_cpumask(cpu)		cpumask_of(cpu)
 #endif
+#ifndef topology_cluster_cpumask
+#define topology_cluster_cpumask(cpu)		cpumask_of(cpu)
+#endif
 #ifndef topology_die_cpumask
 #define topology_die_cpumask(cpu)		cpumask_of(cpu)
 #endif