diff mbox

[v2,2/9] drivers: base: support cpu cache information interface to userspace via sysfs

Message ID 1406306692-7135-3-git-send-email-sudeep.holla@arm.com
State New
Headers show

Commit Message

Sudeep Holla July 25, 2014, 4:44 p.m. UTC
From: Sudeep Holla <sudeep.holla@arm.com>

This patch adds initial support for providing processor cache information
to userspace through sysfs interface. This is based on already existing
implementations(x86, ia64, s390 and powerpc) and hence the interface is
intended to be fully compatible.

The main purpose of this generic support is to avoid further code
duplication to support new architectures and also to unify all the existing
different implementations.

This implementation maintains the hierarchy of cache objects which reflects
the system's cache topology. Cache devices are instantiated as needed as
CPUs come online. The cache information is replicated per-cpu even if they are
shared. A per-cpu array of cache information maintained is used mainly for
sysfs-related book keeping.

It also implements the shared_cpu_map attribute, which is essential for
enabling both kernel and user-space to discover the system's overall cache
topology.

This patch also add the missing ABI documentation for the cacheinfo sysfs
interface already, which is well defined and widely used.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-doc@vger.kernel.org
---
 Documentation/ABI/testing/sysfs-devices-system-cpu |  41 ++
 drivers/base/Makefile                              |   2 +-
 drivers/base/cacheinfo.c                           | 539 +++++++++++++++++++++
 include/linux/cacheinfo.h                          |  73 +++
 4 files changed, 654 insertions(+), 1 deletion(-)
 create mode 100644 drivers/base/cacheinfo.c
 create mode 100644 include/linux/cacheinfo.h

Comments

Stephen Boyd July 29, 2014, 11:09 p.m. UTC | #1
On 07/25/14 09:44, Sudeep Holla wrote:
> diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
> index acb9bfc89b48..832b7f2ed6d2 100644
> --- a/Documentation/ABI/testing/sysfs-devices-system-cpu
> +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
> @@ -224,3 +224,44 @@ Description:	Parameters for the Intel P-state driver
>  		frequency range.
>  
>  		More details can be found in Documentation/cpu-freq/intel-pstate.txt
> +
> +What:		/sys/devices/system/cpu/cpu*/cache/index*/<set_of_attributes_mentioned_below>
> +Date:		July 2014(documented, existed before August 2008)
> +Contact:	Sudeep Holla <sudeep.holla@arm.com>
> +		Linux kernel mailing list <linux-kernel@vger.kernel.org>
> +Description:	Parameters for the CPU cache attributes
> +
> +		attributes:
> +			- writethrough: data is written to both the cache line
> +					and to the block in the lower-level memory
> +			- writeback: data is written only to the cache line and
> +				     the modified cache line is written to main
> +				     memory only when it is replaced
> +			- writeallocate: allocate a memory location to a cache line
> +					 on a cache miss because of a write
> +			- readallocate: allocate a memory location to a cache line
> +					on a cache miss because of a read
> +
> +		coherency_line_size: the minimum amount of data that gets transferred

Is this in bytes?

> +
> +		level: the cache hierarcy in the multi-level cache configuration
> +
> +		number_of_sets: total number of sets in the cache, a set is a
> +				collection of cache lines with the same cache index
> +
> +		physical_line_partition: number of physical cache line per cache tag
> +
> +		shared_cpu_list: the list of cpus sharing the cache

This is a logical list, not physical right?

> +
> +		shared_cpu_map: logical cpu mask containing the list of cpus sharing
> +				the cache
> +
> +		size: the total cache size in kB
> +
> +		type:
> +			- instruction: cache that only holds instructions
> +			- data: cache that only caches data
> +			- unified: cache that holds both data and instructions
> +
> +		ways_of_associativity: degree of freedom in placing a particular block
> +					of memory in the cache
> diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
> new file mode 100644
> index 000000000000..983728a919ec
> --- /dev/null
> +++ b/drivers/base/cacheinfo.c
> @@ -0,0 +1,539 @@
[...]
> +
> +static int detect_cache_attributes(unsigned int cpu)

Unused if sysfs is disabled? Actually it looks like everything except
the weak functions are unused in such a case.

> +static ssize_t shared_cpumap_show_func(struct device *dev, int type, char *buf)
> +{
> +	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
> +	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
> +	int n = 0;
> +
> +	if (len > 1) {
> +		const struct cpumask *mask = &this_leaf->shared_cpu_map;
> +
> +		n = type ? cpulist_scnprintf(buf, len - 2, mask) :
> +			   cpumask_scnprintf(buf, len - 2, mask);
> +		buf[n++] = '\n';
> +		buf[n] = '\0';
> +	}
> +	return n;
> +}

This looks to be lifted from show_cpumap() (well maybe that function was
lifted from x86, not sure). Perhaps it should be extracted out into an
inline function in cpumask.h? Future work I guess.

> +
> +static ssize_t shared_cpu_map_show(struct device *dev,
> +				   struct device_attribute *attr, char *buf)
> +{
> +	return shared_cpumap_show_func(dev, 0, buf);
> +}
> +
> +static ssize_t shared_cpu_list_show(struct device *dev,
> +				    struct device_attribute *attr, char *buf)
> +{
> +	return shared_cpumap_show_func(dev, 1, buf);
> +}
> +
> +static ssize_t type_show(struct device *dev,
> +			 struct device_attribute *attr, char *buf)
> +{
> +	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
> +
> +	switch (this_leaf->type) {
> +	case CACHE_TYPE_DATA:
> +		return sprintf(buf, "Data\n");
> +	case CACHE_TYPE_INST:
> +		return sprintf(buf, "Instruction\n");
> +	case CACHE_TYPE_UNIFIED:
> +		return sprintf(buf, "Unified\n");
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static ssize_t attributes_show(struct device *dev,
> +			       struct device_attribute *attr, char *buf)
> +{
> +	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
> +	unsigned int ci_attr = this_leaf->attributes;
> +	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf - 2;
> +	int n = 0;
> +
> +	if (!ci_attr)
> +		return -EINVAL;
> +
> +	if (ci_attr & CACHE_WRITE_THROUGH)
> +		n += snprintf(buf + n, len - n, "WriteThrough\n");
> +	if (ci_attr & CACHE_WRITE_BACK)
> +		n += snprintf(buf + n, len - n, "WriteBack\n");
> +	if (ci_attr & CACHE_READ_ALLOCATE)
> +		n += snprintf(buf + n, len - n, "ReadAllocate\n");
> +	if (ci_attr & CACHE_WRITE_ALLOCATE)
> +		n += snprintf(buf + n, len - n, "WriteAllocate\n");

I see that ia64 has this attributes file, but in that case only two
attributes exist (write through and write back) and only one value is
ever shown. When we have multiple attributes we'll have multiple lines
to parse here. What if we left attributes around for the ia64 case
(possibly even hiding that entirely within that architecture specific
code) and then have files like "allocation_policy" and "storage_method"
that correspond to whether its read/write allocation and write through
or write back? The goal being to make only one value exist in any sysfs
attribute.

> +	buf[n] = '\0';
> +	return n;
> +}
> +
> +static umode_t
> +cache_default_attrs_is_visible(struct kobject *kobj,
> +			       struct attribute *attr, int unused)
> +{
> +	struct device *dev = kobj_to_dev(kobj);
> +	struct device_attribute *dev_attr;
> +	umode_t mode = attr->mode;
> +	char *buf;
> +
> +	dev_attr = container_of(attr, struct device_attribute, attr);
> +	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!buf)
> +		return 0;
> +
> +	/* create attributes that provides meaningful value */
> +	if (dev_attr->show && dev_attr->show(dev, dev_attr, buf) < 0)
> +		mode = 0;
> +
> +	kfree(buf);

This is sort of sad. We have to allocate a whole page and call the show
function to figure out if the attribute is visible? Why don't we
actually look at what the attribute is and check for the structure
members we care about? It looks like there are only a few combinations.

> +	return mode;
> +}
> +
> +static DEVICE_ATTR_RO(level);
> +static DEVICE_ATTR_RO(type);
> +static DEVICE_ATTR_RO(coherency_line_size);
> +static DEVICE_ATTR_RO(ways_of_associativity);
> +static DEVICE_ATTR_RO(number_of_sets);
> +static DEVICE_ATTR_RO(size);
> +static DEVICE_ATTR_RO(attributes);
> +static DEVICE_ATTR_RO(shared_cpu_map);
> +static DEVICE_ATTR_RO(shared_cpu_list);
> +static DEVICE_ATTR_RO(physical_line_partition);
> +
> +static struct attribute *cache_default_attrs[] = {
> +	&dev_attr_type.attr,
> +	&dev_attr_level.attr,
> +	&dev_attr_shared_cpu_map.attr,
> +	&dev_attr_shared_cpu_list.attr,
> +	&dev_attr_coherency_line_size.attr,
> +	&dev_attr_ways_of_associativity.attr,
> +	&dev_attr_number_of_sets.attr,
> +	&dev_attr_size.attr,
> +	&dev_attr_attributes.attr,
> +	&dev_attr_physical_line_partition.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group cache_default_group = {
> +	.attrs = cache_default_attrs,
> +	.is_visible = cache_default_attrs_is_visible,
> +};
> +
> +static const struct attribute_group *cache_default_groups[] = {
> +	&cache_default_group,
> +	NULL,
> +};
> +
> +static struct attribute_group cache_private_group = {
> +	.is_visible = cache_default_attrs_is_visible,
> +};
> +
> +static const struct attribute_group *cache_private_groups[] = {
> +	&cache_default_group,
> +	(const struct attribute_group *)&cache_private_group,
> +	NULL,
> +};
> +
> +const struct attribute **
> +__weak cache_get_priv_attr(struct cacheinfo *this_leaf)
> +{
> +	return NULL;
> +}
> +
> +static const struct attribute_group **
> +cache_get_attribute_groups(struct cacheinfo *this_leaf)
> +{
> +	const struct attribute **priv_attr = cache_get_priv_attr(this_leaf);
> +
> +	if (!priv_attr)
> +		return cache_default_groups;
> +
> +	if (!cache_private_group.attrs)
> +		cache_private_group.attrs = (struct attribute **)priv_attr;
> +
> +	return cache_private_groups;
> +}

There's lots of odd casting going on here. Can we change
cache_get_priv_attr() to cache_get_priv_attr_group() and then rework x86
code to return a group instead of a bunch of attribute pointers? Then
drop const from cache_default_groups, add an extra blank NULL entry for
the possible arch specific groups and then use ARRAY_SIZE() - 1 to set
that entry with whatever we get from cache_get_priv_attr_group(). Also
drop the whole cache_private_groups thing.

> +
> +/* Add/Remove cache interface for CPU device */
> +static void cpu_cache_sysfs_exit(unsigned int cpu)
> +{
> +	int i;
> +	struct device *ci_dev;
> +
> +	if (per_cpu_index_dev(cpu)) {
> +		for (i = 0; i < cache_leaves(cpu); i++) {
> +			ci_dev = per_cache_index_dev(cpu, i);
> +			if (!ci_dev)
> +				continue;
> +			device_unregister(ci_dev);
> +		}
> +		kfree(per_cpu_index_dev(cpu));
> +		per_cpu_index_dev(cpu) = NULL;
> +	}
> +	device_unregister(per_cpu_cache_dev(cpu));
> +	per_cpu_cache_dev(cpu) = NULL;
> +}
> +
> +static int cpu_cache_sysfs_init(unsigned int cpu)
> +{
> +	struct device *dev = get_cpu_device(cpu);
> +
> +	if (per_cpu_cacheinfo(cpu) == NULL)
> +		return -ENOENT;
> +
> +	per_cpu_cache_dev(cpu) = device_create(dev->class, dev, cpu,
> +					       NULL, "cache");
> +	if (IS_ERR_OR_NULL(per_cpu_cache_dev(cpu)))
> +		return PTR_ERR(per_cpu_cache_dev(cpu));
> +
> +	/* Allocate all required memory */
> +	per_cpu_index_dev(cpu) = kcalloc(cache_leaves(cpu),
> +					 sizeof(struct device *), GFP_KERNEL);
> +	if (unlikely(per_cpu_index_dev(cpu) == NULL))
> +		goto err_out;
> +
> +	return 0;
> +
> +err_out:
> +	cpu_cache_sysfs_exit(cpu);
> +	return -ENOMEM;
> +}
> +
> +static int cache_add_dev(unsigned int cpu)
> +{
> +	unsigned short i;

unsigned int?

> +	int rc;
> +	struct device *ci_dev, *parent;
> +	struct cacheinfo *this_leaf;
> +	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
> +	const struct attribute_group **cache_groups;
> +
> +	rc = cpu_cache_sysfs_init(cpu);
> +	if (unlikely(rc < 0))
> +		return rc;
> +
> +	parent = per_cpu_cache_dev(cpu);
> +	for (i = 0; i < cache_leaves(cpu); i++) {
> +		this_leaf = this_cpu_ci->info_list + i;
> +		if (this_leaf->disable_sysfs)
> +			continue;
> +		cache_groups = cache_get_attribute_groups(this_leaf);
> +		ci_dev = device_create_with_groups(parent->class, parent, i,
> +						   this_leaf, cache_groups,
> +						   "index%1u", i);
> +		if (IS_ERR_OR_NULL(ci_dev)) {

IS_ERR?

> +			rc = PTR_ERR(ci_dev);
> +			goto err;
> +		}
> +		per_cache_index_dev(cpu, i) = ci_dev;
> +	}
> +	cpumask_set_cpu(cpu, &cache_dev_map);
> +
> +	return 0;
> +err:
> +	cpu_cache_sysfs_exit(cpu);
> +	return rc;
> +}
> +
> +static void cache_remove_dev(unsigned int cpu)
> +{
> +	if (!cpumask_test_cpu(cpu, &cache_dev_map))
> +		return;
> +	cpumask_clear_cpu(cpu, &cache_dev_map);
> +
> +	cpu_cache_sysfs_exit(cpu);
> +}
> +
> +static int cacheinfo_cpu_callback(struct notifier_block *nfb,
> +				  unsigned long action, void *hcpu)
> +{
> +	unsigned int cpu = (unsigned long)hcpu;
> +	int rc = 0;
> +
> +	switch (action) {

Looks like we can do action & ~CPU_TASKS_FROZEN to save two lines here

> +	case CPU_ONLINE:
> +	case CPU_ONLINE_FROZEN:
> +		rc = detect_cache_attributes(cpu);
> +		if (!rc)
> +			rc = cache_add_dev(cpu);
> +		break;
> +	case CPU_DEAD:
> +	case CPU_DEAD_FROZEN:
> +		cache_remove_dev(cpu);
> +		if (per_cpu_cacheinfo(cpu))
> +			free_cache_attributes(cpu);
> +		break;
> +	}
> +	return notifier_from_errno(rc);
> +}

Hm... adding/detecting/destroying this stuff every time a CPU is
logically hotplugged seems like a waste of time and energy. Why can't we
only do this work when the CPU is actually physically removed? The path
for that is via the subsys_interface and it would make it easier on
programs that want to learn about cache info as long as the CPU is
present in the system even if it isn't online at the time of reading.

> +
> +static int __init cacheinfo_sysfs_init(void)
> +{
> +	int cpu, rc = 0;
> +
> +	cpu_notifier_register_begin();
> +
> +	for_each_online_cpu(cpu) {
> +		rc = detect_cache_attributes(cpu);
> +		if (rc) {
> +			pr_err("error detecting cacheinfo..cpu%d\n", cpu);
> +			goto out;
> +		}
> +		rc = cache_add_dev(cpu);
> +		if (rc) {
> +			free_cache_attributes(cpu);
> +			pr_err("error populating cacheinfo..cpu%d\n", cpu);
> +			goto out;
> +		}
> +	}
> +	__hotcpu_notifier(cacheinfo_cpu_callback, 0);
> +
> +out:
> +	cpu_notifier_register_done();
> +	return rc;
> +}
> +
> +device_initcall(cacheinfo_sysfs_init);
> +
> +#endif	/* CONFIG_SYSFS */
> diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
> new file mode 100644
> index 000000000000..3d67b4910aa8
> --- /dev/null
> +++ b/include/linux/cacheinfo.h
> @@ -0,0 +1,73 @@
> +#ifndef _LINUX_CACHEINFO_H
> +#define _LINUX_CACHEINFO_H
> +
> +#include <linux/bitops.h>
> +#include <linux/compiler.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/of.h>
> +#include <linux/sysfs.h>

These last three look like they could be removed and we could have
forward declarations for required types. What's compiler.h for?

> +
> +enum cache_type {
> +	CACHE_TYPE_NOCACHE = 0,
> +	CACHE_TYPE_INST = BIT(0),
> +	CACHE_TYPE_DATA = BIT(1),
> +	CACHE_TYPE_SEPARATE = CACHE_TYPE_INST | CACHE_TYPE_DATA,
> +	CACHE_TYPE_UNIFIED = BIT(2),
> +};
> +
> +struct cacheinfo {
> +	/* core properties */
> +	enum cache_type type; /* data, inst or unified */
> +	unsigned int level;
> +	unsigned int coherency_line_size; /* cache line size  */
> +	unsigned int number_of_sets; /* no. of sets per way */
> +	unsigned int ways_of_associativity; /* no. of ways */
> +	unsigned int physical_line_partition; /* no. of lines per tag */
> +	unsigned int size; /* total cache size */
> +	cpumask_t shared_cpu_map;
> +	unsigned int attributes;
> +#define CACHE_WRITE_THROUGH	BIT(0)
> +#define CACHE_WRITE_BACK	BIT(1)
> +#define CACHE_READ_ALLOCATE	BIT(2)
> +#define CACHE_WRITE_ALLOCATE	BIT(3)
> +
> +	/* book keeping */
> +	struct device_node *of_node;	/* cpu if no explicit cache node */
> +	bool disable_sysfs; /* don't expose this leaf through sysfs */
> +	void *priv;
> +};

Can we use real kernel doc notation here please?

> +
> +struct cpu_cacheinfo {
> +	struct cacheinfo *info_list;
> +	unsigned int num_levels;
> +	unsigned int num_leaves;
> +};
> +
> +/*
> + * Helpers to make sure "func" is executed on the cpu whose cache
> + * attributes are being detected
> + */
> +#define DEFINE_SMP_CALL_FUNCTION(func)				\
> +static void _##func(void *ret)					\
> +{								\
> +	int cpu = smp_processor_id();				\
> +	*(int *)ret = __##func(cpu);				\
> +}								\
> +								\
> +int func(unsigned int cpu)					\
> +{								\
> +	int ret;						\
> +	smp_call_function_single(cpu, _##func, &ret, true);	\
> +	return ret;						\
> +}
> +

Bikeshed: Maybe this should be called DEFINE_SMP_CALL_CACHE_FUNCTION()?
Missing include for <linux/smp.h> here?
Sudeep Holla July 30, 2014, 4:23 p.m. UTC | #2
Hi Stephen,

Thanks for reviewing this.

On 30/07/14 00:09, Stephen Boyd wrote:
> On 07/25/14 09:44, Sudeep Holla wrote:
>> diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
>> index acb9bfc89b48..832b7f2ed6d2 100644
>> --- a/Documentation/ABI/testing/sysfs-devices-system-cpu
>> +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
>> @@ -224,3 +224,44 @@ Description:     Parameters for the Intel P-state driver
>>                frequency range.
>>
>>                More details can be found in Documentation/cpu-freq/intel-pstate.txt
>> +
>> +What:                /sys/devices/system/cpu/cpu*/cache/index*/<set_of_attributes_mentioned_below>
>> +Date:                July 2014(documented, existed before August 2008)
>> +Contact:     Sudeep Holla <sudeep.holla@arm.com>
>> +             Linux kernel mailing list <linux-kernel@vger.kernel.org>
>> +Description: Parameters for the CPU cache attributes
>> +
>> +             attributes:
>> +                     - writethrough: data is written to both the cache line
>> +                                     and to the block in the lower-level memory
>> +                     - writeback: data is written only to the cache line and
>> +                                  the modified cache line is written to main
>> +                                  memory only when it is replaced
>> +                     - writeallocate: allocate a memory location to a cache line
>> +                                      on a cache miss because of a write
>> +                     - readallocate: allocate a memory location to a cache line
>> +                                     on a cache miss because of a read
>> +
>> +             coherency_line_size: the minimum amount of data that gets transferred
>
> Is this in bytes?
>

Right, fixed now.

>> +
>> +             level: the cache hierarcy in the multi-level cache configuration
>> +
>> +             number_of_sets: total number of sets in the cache, a set is a
>> +                             collection of cache lines with the same cache index
>> +
>> +             physical_line_partition: number of physical cache line per cache tag
>> +
>> +             shared_cpu_list: the list of cpus sharing the cache
>
> This is a logical list, not physical right?
>

Right, done.

>> +
>> +             shared_cpu_map: logical cpu mask containing the list of cpus sharing
>> +                             the cache
>> +
>> +             size: the total cache size in kB
>> +
>> +             type:
>> +                     - instruction: cache that only holds instructions
>> +                     - data: cache that only caches data
>> +                     - unified: cache that holds both data and instructions
>> +
>> +             ways_of_associativity: degree of freedom in placing a particular block
>> +                                     of memory in the cache
>> diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
>> new file mode 100644
>> index 000000000000..983728a919ec
>> --- /dev/null
>> +++ b/drivers/base/cacheinfo.c
>> @@ -0,0 +1,539 @@
> [...]
>> +
>> +static int detect_cache_attributes(unsigned int cpu)
>
> Unused if sysfs is disabled? Actually it looks like everything except
> the weak functions are unused in such a case.
>



>> +static ssize_t shared_cpumap_show_func(struct device *dev, int type, char *buf)
>> +{
>> +     struct cacheinfo *this_leaf = dev_get_drvdata(dev);
>> +     ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
>> +     int n = 0;
>> +
>> +     if (len > 1) {
>> +             const struct cpumask *mask = &this_leaf->shared_cpu_map;
>> +
>> +             n = type ? cpulist_scnprintf(buf, len - 2, mask) :
>> +                        cpumask_scnprintf(buf, len - 2, mask);
>> +             buf[n++] = '\n';
>> +             buf[n] = '\0';
>> +     }
>> +     return n;
>> +}
>
> This looks to be lifted from show_cpumap() (well maybe that function was
> lifted from x86, not sure). Perhaps it should be extracted out into an
> inline function in cpumask.h? Future work I guess.
>

Right looks like this is copied from drivers/base/topology.c, will move to
cpumask.h

>> +
>> +static ssize_t shared_cpu_map_show(struct device *dev,
>> +                                struct device_attribute *attr, char *buf)
>> +{
>> +     return shared_cpumap_show_func(dev, 0, buf);
>> +}
>> +
>> +static ssize_t shared_cpu_list_show(struct device *dev,
>> +                                 struct device_attribute *attr, char *buf)
>> +{
>> +     return shared_cpumap_show_func(dev, 1, buf);
>> +}
>> +
>> +static ssize_t type_show(struct device *dev,
>> +                      struct device_attribute *attr, char *buf)
>> +{
>> +     struct cacheinfo *this_leaf = dev_get_drvdata(dev);
>> +
>> +     switch (this_leaf->type) {
>> +     case CACHE_TYPE_DATA:
>> +             return sprintf(buf, "Data\n");
>> +     case CACHE_TYPE_INST:
>> +             return sprintf(buf, "Instruction\n");
>> +     case CACHE_TYPE_UNIFIED:
>> +             return sprintf(buf, "Unified\n");
>> +     default:
>> +             return -EINVAL;
>> +     }
>> +}
>> +
>> +static ssize_t attributes_show(struct device *dev,
>> +                            struct device_attribute *attr, char *buf)
>> +{
>> +     struct cacheinfo *this_leaf = dev_get_drvdata(dev);
>> +     unsigned int ci_attr = this_leaf->attributes;
>> +     ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf - 2;
>> +     int n = 0;
>> +
>> +     if (!ci_attr)
>> +             return -EINVAL;
>> +
>> +     if (ci_attr & CACHE_WRITE_THROUGH)
>> +             n += snprintf(buf + n, len - n, "WriteThrough\n");
>> +     if (ci_attr & CACHE_WRITE_BACK)
>> +             n += snprintf(buf + n, len - n, "WriteBack\n");
>> +     if (ci_attr & CACHE_READ_ALLOCATE)
>> +             n += snprintf(buf + n, len - n, "ReadAllocate\n");
>> +     if (ci_attr & CACHE_WRITE_ALLOCATE)
>> +             n += snprintf(buf + n, len - n, "WriteAllocate\n");
>
> I see that ia64 has this attributes file, but in that case only two
> attributes exist (write through and write back) and only one value is
> ever shown. When we have multiple attributes we'll have multiple lines
> to parse here. What if we left attributes around for the ia64 case
> (possibly even hiding that entirely within that architecture specific
> code) and then have files like "allocation_policy" and "storage_method"
> that correspond to whether its read/write allocation and write through
> or write back? The goal being to make only one value exist in any sysfs
> attribute.
>

I like your idea, but is it hard rule to have only one value in any
sysfs attribute ? Though one concern I have is if different cache designs
make have different features and like to express that, 'attributes' is a
unified place to do that similar to cpu features in /proc/cpuinfo.

Anyways if we decide to split it, how about write_policy instead of
storage_method ?

>> +     buf[n] = '\0';
>> +     return n;
>> +}
>> +
>> +static umode_t
>> +cache_default_attrs_is_visible(struct kobject *kobj,
>> +                            struct attribute *attr, int unused)
>> +{
>> +     struct device *dev = kobj_to_dev(kobj);
>> +     struct device_attribute *dev_attr;
>> +     umode_t mode = attr->mode;
>> +     char *buf;
>> +
>> +     dev_attr = container_of(attr, struct device_attribute, attr);
>> +     buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
>> +     if (!buf)
>> +             return 0;
>> +
>> +     /* create attributes that provides meaningful value */
>> +     if (dev_attr->show && dev_attr->show(dev, dev_attr, buf) < 0)
>> +             mode = 0;
>> +
>> +     kfree(buf);
>
> This is sort of sad. We have to allocate a whole page and call the show
> function to figure out if the attribute is visible? Why don't we
> actually look at what the attribute is and check for the structure
> members we care about? It looks like there are only a few combinations.
>

Yes I thought about that, as even I didn't like that allocation. But if
we want the private attributes also use the same is_visible callback, we
can't check member directly as we don't know the details of the
individual element.

Even if we have compare elements we need to compare the attribute and
then the value for each element in the structure, requiring changes if
elements are added/removed. I am fine either way, just explaining why
it's done so.

>> +     return mode;
>> +}
>> +
>> +static DEVICE_ATTR_RO(level);
>> +static DEVICE_ATTR_RO(type);
>> +static DEVICE_ATTR_RO(coherency_line_size);
>> +static DEVICE_ATTR_RO(ways_of_associativity);
>> +static DEVICE_ATTR_RO(number_of_sets);
>> +static DEVICE_ATTR_RO(size);
>> +static DEVICE_ATTR_RO(attributes);
>> +static DEVICE_ATTR_RO(shared_cpu_map);
>> +static DEVICE_ATTR_RO(shared_cpu_list);
>> +static DEVICE_ATTR_RO(physical_line_partition);
>> +
>> +static struct attribute *cache_default_attrs[] = {
>> +     &dev_attr_type.attr,
>> +     &dev_attr_level.attr,
>> +     &dev_attr_shared_cpu_map.attr,
>> +     &dev_attr_shared_cpu_list.attr,
>> +     &dev_attr_coherency_line_size.attr,
>> +     &dev_attr_ways_of_associativity.attr,
>> +     &dev_attr_number_of_sets.attr,
>> +     &dev_attr_size.attr,
>> +     &dev_attr_attributes.attr,
>> +     &dev_attr_physical_line_partition.attr,
>> +     NULL
>> +};
>> +
>> +static const struct attribute_group cache_default_group = {
>> +     .attrs = cache_default_attrs,
>> +     .is_visible = cache_default_attrs_is_visible,
>> +};
>> +
>> +static const struct attribute_group *cache_default_groups[] = {
>> +     &cache_default_group,
>> +     NULL,
>> +};
>> +
>> +static struct attribute_group cache_private_group = {
>> +     .is_visible = cache_default_attrs_is_visible,
>> +};
>> +
>> +static const struct attribute_group *cache_private_groups[] = {
>> +     &cache_default_group,
>> +     (const struct attribute_group *)&cache_private_group,
>> +     NULL,
>> +};
>> +
>> +const struct attribute **
>> +__weak cache_get_priv_attr(struct cacheinfo *this_leaf)
>> +{
>> +     return NULL;
>> +}
>> +
>> +static const struct attribute_group **
>> +cache_get_attribute_groups(struct cacheinfo *this_leaf)
>> +{
>> +     const struct attribute **priv_attr = cache_get_priv_attr(this_leaf);
>> +
>> +     if (!priv_attr)
>> +             return cache_default_groups;
>> +
>> +     if (!cache_private_group.attrs)
>> +             cache_private_group.attrs = (struct attribute **)priv_attr;
>> +
>> +     return cache_private_groups;
>> +}
>
> There's lots of odd casting going on here. Can we change
> cache_get_priv_attr() to cache_get_priv_attr_group() and then rework x86
> code to return a group instead of a bunch of attribute pointers? Then
> drop const from cache_default_groups, add an extra blank NULL entry for
> the possible arch specific groups and then use ARRAY_SIZE() - 1 to set
> that entry with whatever we get from cache_get_priv_attr_group(). Also
> drop the whole cache_private_groups thing.
>

I initially had exactly same. I later changed to this mainly to avoid:
1. is_visible duplication
2. run-time updates to cache_groups for each node

No strong opinion again, I am fine either way. Let's see what Greg has
to say.

>> +
>> +/* Add/Remove cache interface for CPU device */
>> +static void cpu_cache_sysfs_exit(unsigned int cpu)
>> +{
>> +     int i;
>> +     struct device *ci_dev;
>> +
>> +     if (per_cpu_index_dev(cpu)) {
>> +             for (i = 0; i < cache_leaves(cpu); i++) {
>> +                     ci_dev = per_cache_index_dev(cpu, i);
>> +                     if (!ci_dev)
>> +                             continue;
>> +                     device_unregister(ci_dev);
>> +             }
>> +             kfree(per_cpu_index_dev(cpu));
>> +             per_cpu_index_dev(cpu) = NULL;
>> +     }
>> +     device_unregister(per_cpu_cache_dev(cpu));
>> +     per_cpu_cache_dev(cpu) = NULL;
>> +}
>> +
>> +static int cpu_cache_sysfs_init(unsigned int cpu)
>> +{
>> +     struct device *dev = get_cpu_device(cpu);
>> +
>> +     if (per_cpu_cacheinfo(cpu) == NULL)
>> +             return -ENOENT;
>> +
>> +     per_cpu_cache_dev(cpu) = device_create(dev->class, dev, cpu,
>> +                                            NULL, "cache");
>> +     if (IS_ERR_OR_NULL(per_cpu_cache_dev(cpu)))
>> +             return PTR_ERR(per_cpu_cache_dev(cpu));
>> +
>> +     /* Allocate all required memory */
>> +     per_cpu_index_dev(cpu) = kcalloc(cache_leaves(cpu),
>> +                                      sizeof(struct device *), GFP_KERNEL);
>> +     if (unlikely(per_cpu_index_dev(cpu) == NULL))
>> +             goto err_out;
>> +
>> +     return 0;
>> +
>> +err_out:
>> +     cpu_cache_sysfs_exit(cpu);
>> +     return -ENOMEM;
>> +}
>> +
>> +static int cache_add_dev(unsigned int cpu)
>> +{
>> +     unsigned short i;
>
> unsigned int?
>

Right, left over from early developments when num_leaves were short.

>> +     int rc;
>> +     struct device *ci_dev, *parent;
>> +     struct cacheinfo *this_leaf;
>> +     struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
>> +     const struct attribute_group **cache_groups;
>> +
>> +     rc = cpu_cache_sysfs_init(cpu);
>> +     if (unlikely(rc < 0))
>> +             return rc;
>> +
>> +     parent = per_cpu_cache_dev(cpu);
>> +     for (i = 0; i < cache_leaves(cpu); i++) {
>> +             this_leaf = this_cpu_ci->info_list + i;
>> +             if (this_leaf->disable_sysfs)
>> +                     continue;
>> +             cache_groups = cache_get_attribute_groups(this_leaf);
>> +             ci_dev = device_create_with_groups(parent->class, parent, i,
>> +                                                this_leaf, cache_groups,
>> +                                                "index%1u", i);
>> +             if (IS_ERR_OR_NULL(ci_dev)) {
>
> IS_ERR?
>

Right, done.

>> +                     rc = PTR_ERR(ci_dev);
>> +                     goto err;
>> +             }
>> +             per_cache_index_dev(cpu, i) = ci_dev;
>> +     }
>> +     cpumask_set_cpu(cpu, &cache_dev_map);
>> +
>> +     return 0;
>> +err:
>> +     cpu_cache_sysfs_exit(cpu);
>> +     return rc;
>> +}
>> +
>> +static void cache_remove_dev(unsigned int cpu)
>> +{
>> +     if (!cpumask_test_cpu(cpu, &cache_dev_map))
>> +             return;
>> +     cpumask_clear_cpu(cpu, &cache_dev_map);
>> +
>> +     cpu_cache_sysfs_exit(cpu);
>> +}
>> +
>> +static int cacheinfo_cpu_callback(struct notifier_block *nfb,
>> +                               unsigned long action, void *hcpu)
>> +{
>> +     unsigned int cpu = (unsigned long)hcpu;
>> +     int rc = 0;
>> +
>> +     switch (action) {
>
> Looks like we can do action & ~CPU_TASKS_FROZEN to save two lines here
>

Done.

>> +     case CPU_ONLINE:
>> +     case CPU_ONLINE_FROZEN:
>> +             rc = detect_cache_attributes(cpu);
>> +             if (!rc)
>> +                     rc = cache_add_dev(cpu);
>> +             break;
>> +     case CPU_DEAD:
>> +     case CPU_DEAD_FROZEN:
>> +             cache_remove_dev(cpu);
>> +             if (per_cpu_cacheinfo(cpu))
>> +                     free_cache_attributes(cpu);
>> +             break;
>> +     }
>> +     return notifier_from_errno(rc);
>> +}
>
> Hm... adding/detecting/destroying this stuff every time a CPU is
> logically hotplugged seems like a waste of time and energy. Why can't we
> only do this work when the CPU is actually physically removed? The path
> for that is via the subsys_interface and it would make it easier on
> programs that want to learn about cache info as long as the CPU is
> present in the system even if it isn't online at the time of reading.
>

I agree, but the main reason I retained it as most of the existing
architectures implement this way and I didn't want tho change that
behaviour.

>> +
>> +static int __init cacheinfo_sysfs_init(void)
>> +{
>> +     int cpu, rc = 0;
>> +
>> +     cpu_notifier_register_begin();
>> +
>> +     for_each_online_cpu(cpu) {
>> +             rc = detect_cache_attributes(cpu);
>> +             if (rc) {
>> +                     pr_err("error detecting cacheinfo..cpu%d\n", cpu);
>> +                     goto out;
>> +             }
>> +             rc = cache_add_dev(cpu);
>> +             if (rc) {
>> +                     free_cache_attributes(cpu);
>> +                     pr_err("error populating cacheinfo..cpu%d\n", cpu);
>> +                     goto out;
>> +             }
>> +     }
>> +     __hotcpu_notifier(cacheinfo_cpu_callback, 0);
>> +
>> +out:
>> +     cpu_notifier_register_done();
>> +     return rc;
>> +}
>> +
>> +device_initcall(cacheinfo_sysfs_init);
>> +
>> +#endif       /* CONFIG_SYSFS */
>> diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
>> new file mode 100644
>> index 000000000000..3d67b4910aa8
>> --- /dev/null
>> +++ b/include/linux/cacheinfo.h
>> @@ -0,0 +1,73 @@
>> +#ifndef _LINUX_CACHEINFO_H
>> +#define _LINUX_CACHEINFO_H
>> +
>> +#include <linux/bitops.h>
>> +#include <linux/compiler.h>
>> +#include <linux/cpumask.h>
>> +#include <linux/device.h>
>> +#include <linux/of.h>
>> +#include <linux/sysfs.h>
>
> These last three look like they could be removed and we could have
> forward declarations for required types. What's compiler.h for?
>

IIRC I added it for inline attribute, but looks like I dropped inline :(

>> +
>> +enum cache_type {
>> +     CACHE_TYPE_NOCACHE = 0,
>> +     CACHE_TYPE_INST = BIT(0),
>> +     CACHE_TYPE_DATA = BIT(1),
>> +     CACHE_TYPE_SEPARATE = CACHE_TYPE_INST | CACHE_TYPE_DATA,
>> +     CACHE_TYPE_UNIFIED = BIT(2),
>> +};
>> +
>> +struct cacheinfo {
>> +     /* core properties */
>> +     enum cache_type type; /* data, inst or unified */
>> +     unsigned int level;
>> +     unsigned int coherency_line_size; /* cache line size  */
>> +     unsigned int number_of_sets; /* no. of sets per way */
>> +     unsigned int ways_of_associativity; /* no. of ways */
>> +     unsigned int physical_line_partition; /* no. of lines per tag */
>> +     unsigned int size; /* total cache size */
>> +     cpumask_t shared_cpu_map;
>> +     unsigned int attributes;
>> +#define CACHE_WRITE_THROUGH  BIT(0)
>> +#define CACHE_WRITE_BACK     BIT(1)
>> +#define CACHE_READ_ALLOCATE  BIT(2)
>> +#define CACHE_WRITE_ALLOCATE BIT(3)
>> +
>> +     /* book keeping */
>> +     struct device_node *of_node;    /* cpu if no explicit cache node */
>> +     bool disable_sysfs; /* don't expose this leaf through sysfs */
>> +     void *priv;
>> +};
>
> Can we use real kernel doc notation here please?
>

OK

>> +
>> +struct cpu_cacheinfo {
>> +     struct cacheinfo *info_list;
>> +     unsigned int num_levels;
>> +     unsigned int num_leaves;
>> +};
>> +
>> +/*
>> + * Helpers to make sure "func" is executed on the cpu whose cache
>> + * attributes are being detected
>> + */
>> +#define DEFINE_SMP_CALL_FUNCTION(func)                               \
>> +static void _##func(void *ret)                                       \
>> +{                                                            \
>> +     int cpu = smp_processor_id();                           \
>> +     *(int *)ret = __##func(cpu);                            \
>> +}                                                            \
>> +                                                             \
>> +int func(unsigned int cpu)                                   \
>> +{                                                            \
>> +     int ret;                                                \
>> +     smp_call_function_single(cpu, _##func, &ret, true);     \
>> +     return ret;                                             \
>> +}
>> +
>
> Bikeshed: Maybe this should be called DEFINE_SMP_CALL_CACHE_FUNCTION()?
> Missing include for <linux/smp.h> here?

Makes sense, will change.

>
> --
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> hosted by The Linux Foundation
>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Stephen Boyd July 31, 2014, 7:46 p.m. UTC | #3
On 07/30/14 09:23, Sudeep Holla wrote:
> Hi Stephen,
>
> Thanks for reviewing this.
>
> On 30/07/14 00:09, Stephen Boyd wrote:
>> On 07/25/14 09:44, Sudeep Holla wrote:
>
>>> +
>>> +             shared_cpu_map: logical cpu mask containing the list
>>> of cpus sharing
>>> +                             the cache
>>> +
>>> +             size: the total cache size in kB
>>> +
>>> +             type:
>>> +                     - instruction: cache that only holds instructions
>>> +                     - data: cache that only caches data
>>> +                     - unified: cache that holds both data and
>>> instructions
>>> +
>>> +             ways_of_associativity: degree of freedom in placing a
>>> particular block
>>> +                                     of memory in the cache
>>> diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
>>> new file mode 100644
>>> index 000000000000..983728a919ec
>>> --- /dev/null
>>> +++ b/drivers/base/cacheinfo.c
>>> @@ -0,0 +1,539 @@
>> [...]
>>> +
>>> +static int detect_cache_attributes(unsigned int cpu)
>>
>> Unused if sysfs is disabled? Actually it looks like everything except
>> the weak functions are unused in such a case.
>>
>
>> I see that ia64 has this attributes file, but in that case only two
>> attributes exist (write through and write back) and only one value is
>> ever shown. When we have multiple attributes we'll have multiple lines
>> to parse here. What if we left attributes around for the ia64 case
>> (possibly even hiding that entirely within that architecture specific
>> code) and then have files like "allocation_policy" and "storage_method"
>> that correspond to whether its read/write allocation and write through
>> or write back? The goal being to make only one value exist in any sysfs
>> attribute.
>>
>
> I like your idea, but is it hard rule to have only one value in any
> sysfs attribute ? Though one concern I have is if different cache designs
> make have different features and like to express that, 'attributes' is a
> unified place to do that similar to cpu features in /proc/cpuinfo.

'attributes' seems too generic. Pretty much anything is an attribute.

>
> Anyways if we decide to split it, how about write_policy instead of
> storage_method ?

Sounds good.

>
>>> +     buf[n] = '\0';
>>> +     return n;
>>> +}
>>> +
>>> +static umode_t
>>> +cache_default_attrs_is_visible(struct kobject *kobj,
>>> +                            struct attribute *attr, int unused)
>>> +{
>>> +     struct device *dev = kobj_to_dev(kobj);
>>> +     struct device_attribute *dev_attr;
>>> +     umode_t mode = attr->mode;
>>> +     char *buf;
>>> +
>>> +     dev_attr = container_of(attr, struct device_attribute, attr);
>>> +     buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
>>> +     if (!buf)
>>> +             return 0;
>>> +
>>> +     /* create attributes that provides meaningful value */
>>> +     if (dev_attr->show && dev_attr->show(dev, dev_attr, buf) < 0)
>>> +             mode = 0;
>>> +
>>> +     kfree(buf);
>>
>> This is sort of sad. We have to allocate a whole page and call the show
>> function to figure out if the attribute is visible? Why don't we
>> actually look at what the attribute is and check for the structure
>> members we care about? It looks like there are only a few combinations.
>>
>
> Yes I thought about that, as even I didn't like that allocation. But if
> we want the private attributes also use the same is_visible callback, we
> can't check member directly as we don't know the details of the
> individual element.
>
> Even if we have compare elements we need to compare the attribute and
> then the value for each element in the structure, requiring changes if
> elements are added/removed. I am fine either way, just explaining why
> it's done so.

Does any other sysfs attribute group do this? If it was desired I would
think someone else would have done this already, or we wouldn't have
even had an is_visible in the first place as this generic code would
replace it.

>
>
>>> +     case CPU_ONLINE:
>>> +     case CPU_ONLINE_FROZEN:
>>> +             rc = detect_cache_attributes(cpu);
>>> +             if (!rc)
>>> +                     rc = cache_add_dev(cpu);
>>> +             break;
>>> +     case CPU_DEAD:
>>> +     case CPU_DEAD_FROZEN:
>>> +             cache_remove_dev(cpu);
>>> +             if (per_cpu_cacheinfo(cpu))
>>> +                     free_cache_attributes(cpu);
>>> +             break;
>>> +     }
>>> +     return notifier_from_errno(rc);
>>> +}
>>
>> Hm... adding/detecting/destroying this stuff every time a CPU is
>> logically hotplugged seems like a waste of time and energy. Why can't we
>> only do this work when the CPU is actually physically removed? The path
>> for that is via the subsys_interface and it would make it easier on
>> programs that want to learn about cache info as long as the CPU is
>> present in the system even if it isn't online at the time of reading.
>>
>
> I agree, but the main reason I retained it as most of the existing
> architectures implement this way and I didn't want tho change that
> behaviour.

Would anything bad happen if we loosened the behavior so that the
directory is always present as long as the CPU is present? I doubt it.
Seems like a low risk change.
Sudeep Holla Aug. 5, 2014, 6:15 p.m. UTC | #4
Hi Stephen,

On 31/07/14 20:46, Stephen Boyd wrote:
> On 07/30/14 09:23, Sudeep Holla wrote:
>> Hi Stephen,
>>
>> Thanks for reviewing this.
>>
>> On 30/07/14 00:09, Stephen Boyd wrote:
>>> On 07/25/14 09:44, Sudeep Holla wrote:
>>
>>>> +
>>>> +             shared_cpu_map: logical cpu mask containing the list
>>>> of cpus sharing
>>>> +                             the cache
>>>> +
>>>> +             size: the total cache size in kB
>>>> +
>>>> +             type:
>>>> +                     - instruction: cache that only holds instructions
>>>> +                     - data: cache that only caches data
>>>> +                     - unified: cache that holds both data and
>>>> instructions
>>>> +
>>>> +             ways_of_associativity: degree of freedom in placing a
>>>> particular block
>>>> +                                     of memory in the cache
>>>> diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
>>>> new file mode 100644
>>>> index 000000000000..983728a919ec
>>>> --- /dev/null
>>>> +++ b/drivers/base/cacheinfo.c
>>>> @@ -0,0 +1,539 @@
>>> [...]
>>>> +
>>>> +static int detect_cache_attributes(unsigned int cpu)
>>>
>>> Unused if sysfs is disabled? Actually it looks like everything except
>>> the weak functions are unused in such a case.
>>>

I see that sysfs has dummy implementations, probably I can remove #ifdef

>>
>>> I see that ia64 has this attributes file, but in that case only two
>>> attributes exist (write through and write back) and only one value is
>>> ever shown. When we have multiple attributes we'll have multiple lines
>>> to parse here. What if we left attributes around for the ia64 case
>>> (possibly even hiding that entirely within that architecture specific
>>> code) and then have files like "allocation_policy" and "storage_method"
>>> that correspond to whether its read/write allocation and write through
>>> or write back? The goal being to make only one value exist in any sysfs
>>> attribute.
>>>
>>
>> I like your idea, but is it hard rule to have only one value in any
>> sysfs attribute ? Though one concern I have is if different cache designs
>> make have different features and like to express that, 'attributes' is a
>> unified place to do that similar to cpu features in /proc/cpuinfo.
>
> 'attributes' seems too generic. Pretty much anything is an attribute.
>

Yes I agree and hence I compared it to /proc/cpuinfo.
As I said I am fine with new single value sysfs, but my main concern is
the extendability. If we don't for-see any changes in near future, then
we can go with new files as you suggested.

>>
>> Anyways if we decide to split it, how about write_policy instead of
>> storage_method ?
>
> Sounds good.
>

Thanks.

>>
>>>> +     buf[n] = '\0';
>>>> +     return n;
>>>> +}
>>>> +
>>>> +static umode_t
>>>> +cache_default_attrs_is_visible(struct kobject *kobj,
>>>> +                            struct attribute *attr, int unused)
>>>> +{
>>>> +     struct device *dev = kobj_to_dev(kobj);
>>>> +     struct device_attribute *dev_attr;
>>>> +     umode_t mode = attr->mode;
>>>> +     char *buf;
>>>> +
>>>> +     dev_attr = container_of(attr, struct device_attribute, attr);
>>>> +     buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
>>>> +     if (!buf)
>>>> +             return 0;
>>>> +
>>>> +     /* create attributes that provides meaningful value */
>>>> +     if (dev_attr->show && dev_attr->show(dev, dev_attr, buf) < 0)
>>>> +             mode = 0;
>>>> +
>>>> +     kfree(buf);
>>>
>>> This is sort of sad. We have to allocate a whole page and call the show
>>> function to figure out if the attribute is visible? Why don't we
>>> actually look at what the attribute is and check for the structure
>>> members we care about? It looks like there are only a few combinations.
>>>
>>
>> Yes I thought about that, as even I didn't like that allocation. But if
>> we want the private attributes also use the same is_visible callback, we
>> can't check member directly as we don't know the details of the
>> individual element.
>>
>> Even if we have compare elements we need to compare the attribute and
>> then the value for each element in the structure, requiring changes if
>> elements are added/removed. I am fine either way, just explaining why
>> it's done so.
>
> Does any other sysfs attribute group do this? If it was desired I would
> think someone else would have done this already, or we wouldn't have
> even had an is_visible in the first place as this generic code would
> replace it.
>

I saw this first in PPC cacheinfo. Not sure who else have done that.

>>
>>
>>>> +     case CPU_ONLINE:
>>>> +     case CPU_ONLINE_FROZEN:
>>>> +             rc = detect_cache_attributes(cpu);
>>>> +             if (!rc)
>>>> +                     rc = cache_add_dev(cpu);
>>>> +             break;
>>>> +     case CPU_DEAD:
>>>> +     case CPU_DEAD_FROZEN:
>>>> +             cache_remove_dev(cpu);
>>>> +             if (per_cpu_cacheinfo(cpu))
>>>> +                     free_cache_attributes(cpu);
>>>> +             break;
>>>> +     }
>>>> +     return notifier_from_errno(rc);
>>>> +}
>>>
>>> Hm... adding/detecting/destroying this stuff every time a CPU is
>>> logically hotplugged seems like a waste of time and energy. Why can't we
>>> only do this work when the CPU is actually physically removed? The path
>>> for that is via the subsys_interface and it would make it easier on
>>> programs that want to learn about cache info as long as the CPU is
>>> present in the system even if it isn't online at the time of reading.
>>>
>>
>> I agree, but the main reason I retained it as most of the existing
>> architectures implement this way and I didn't want tho change that
>> behaviour.
>
> Would anything bad happen if we loosened the behavior so that the
> directory is always present as long as the CPU is present? I doubt it.
> Seems like a low risk change.
>

Yes, but before I change, I would like to see people are fine with that.
I don't want to move existing implementations into this generic one and
cause breakage.

Regards,
Sudeep

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
diff mbox

Patch

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index acb9bfc89b48..832b7f2ed6d2 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -224,3 +224,44 @@  Description:	Parameters for the Intel P-state driver
 		frequency range.
 
 		More details can be found in Documentation/cpu-freq/intel-pstate.txt
+
+What:		/sys/devices/system/cpu/cpu*/cache/index*/<set_of_attributes_mentioned_below>
+Date:		July 2014(documented, existed before August 2008)
+Contact:	Sudeep Holla <sudeep.holla@arm.com>
+		Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Parameters for the CPU cache attributes
+
+		attributes:
+			- writethrough: data is written to both the cache line
+					and to the block in the lower-level memory
+			- writeback: data is written only to the cache line and
+				     the modified cache line is written to main
+				     memory only when it is replaced
+			- writeallocate: allocate a memory location to a cache line
+					 on a cache miss because of a write
+			- readallocate: allocate a memory location to a cache line
+					on a cache miss because of a read
+
+		coherency_line_size: the minimum amount of data that gets transferred
+
+		level: the cache hierarcy in the multi-level cache configuration
+
+		number_of_sets: total number of sets in the cache, a set is a
+				collection of cache lines with the same cache index
+
+		physical_line_partition: number of physical cache line per cache tag
+
+		shared_cpu_list: the list of cpus sharing the cache
+
+		shared_cpu_map: logical cpu mask containing the list of cpus sharing
+				the cache
+
+		size: the total cache size in kB
+
+		type:
+			- instruction: cache that only holds instructions
+			- data: cache that only caches data
+			- unified: cache that holds both data and instructions
+
+		ways_of_associativity: degree of freedom in placing a particular block
+					of memory in the cache
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 04b314e0fa51..bad2ff809bec 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -4,7 +4,7 @@  obj-y			:= component.o core.o bus.o dd.o syscore.o \
 			   driver.o class.o platform.o \
 			   cpu.o firmware.o init.o map.o devres.o \
 			   attribute_container.o transport_class.o \
-			   topology.o container.o
+			   topology.o container.o cacheinfo.o
 obj-$(CONFIG_DEVTMPFS)	+= devtmpfs.o
 obj-$(CONFIG_DMA_CMA) += dma-contiguous.o
 obj-y			+= power/
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
new file mode 100644
index 000000000000..983728a919ec
--- /dev/null
+++ b/drivers/base/cacheinfo.c
@@ -0,0 +1,539 @@ 
+/*
+ * cacheinfo support - processor cache information via sysfs
+ *
+ * Based on arch/x86/kernel/cpu/intel_cacheinfo.c
+ * Author: Sudeep Holla <sudeep.holla@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/bitops.h>
+#include <linux/cacheinfo.h>
+#include <linux/compiler.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/sysfs.h>
+
+/* pointer to per cpu cacheinfo */
+static DEFINE_PER_CPU(struct cpu_cacheinfo, ci_cpu_cacheinfo);
+#define ci_cacheinfo(cpu)	(&per_cpu(ci_cpu_cacheinfo, cpu))
+#define cache_leaves(cpu)	(ci_cacheinfo(cpu)->num_leaves)
+#define per_cpu_cacheinfo(cpu)	(ci_cacheinfo(cpu)->info_list)
+
+struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu)
+{
+	return ci_cacheinfo(cpu);
+}
+
+#ifdef CONFIG_OF
+static int cache_setup_of_node(unsigned int cpu)
+{
+	struct device_node *np;
+	struct cacheinfo *this_leaf;
+	struct device *cpu_dev = get_cpu_device(cpu);
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	unsigned int index = 0;
+
+	/* skip if of_node is already populated */
+	if (this_cpu_ci->info_list->of_node)
+		return 0;
+
+	if (!cpu_dev) {
+		pr_err("No cpu device for CPU %d\n", cpu);
+		return -ENODEV;
+	}
+	np = cpu_dev->of_node;
+	if (!np) {
+		pr_err("Failed to find cpu%d device node\n", cpu);
+		return -ENOENT;
+	}
+
+	while (np && index < cache_leaves(cpu)) {
+		this_leaf = this_cpu_ci->info_list + index;
+		if (this_leaf->level != 1)
+			np = of_find_next_cache_node(np);
+		else
+			np = of_node_get(np);/* cpu node itself */
+		this_leaf->of_node = np;
+		index++;
+	}
+	return 0;
+}
+
+static inline bool cache_leaves_are_shared(struct cacheinfo *this_leaf,
+					   struct cacheinfo *sib_leaf)
+{
+	return sib_leaf->of_node == this_leaf->of_node;
+}
+
+static int of_cache_shared_cpu_map_setup(unsigned int cpu)
+{
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	struct cacheinfo *this_leaf, *sib_leaf;
+	unsigned int index;
+	int ret;
+
+	ret = cache_setup_of_node(cpu);
+	if (ret)
+		return ret;
+
+	for (index = 0; index < cache_leaves(cpu); index++) {
+		unsigned int i;
+
+		this_leaf = this_cpu_ci->info_list + index;
+		cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
+
+		for_each_online_cpu(i) {
+			struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
+
+			if (i == cpu || !sib_cpu_ci->info_list)
+				continue;/* skip if itself or no cacheinfo */
+			sib_leaf = sib_cpu_ci->info_list + index;
+			if (cache_leaves_are_shared(this_leaf, sib_leaf)) {
+				cpumask_set_cpu(cpu, &sib_leaf->shared_cpu_map);
+				cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
+			}
+		}
+	}
+
+	return 0;
+}
+#else
+static inline int of_cache_shared_cpu_map_setup(unsigned int cpu)
+{
+	return 0;
+}
+#endif
+
+static void cache_shared_cpu_map_remove(unsigned int cpu)
+{
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	struct cacheinfo *this_leaf, *sib_leaf;
+	unsigned int sibling, index;
+
+	for (index = 0; index < cache_leaves(cpu); index++) {
+		this_leaf = this_cpu_ci->info_list + index;
+		for_each_cpu(sibling, &this_leaf->shared_cpu_map) {
+			struct cpu_cacheinfo *sib_cpu_ci;
+
+			if (sibling == cpu) /* skip itself */
+				continue;
+			sib_cpu_ci = get_cpu_cacheinfo(sibling);
+			sib_leaf = sib_cpu_ci->info_list + index;
+			cpumask_clear_cpu(cpu, &sib_leaf->shared_cpu_map);
+			cpumask_clear_cpu(sibling, &this_leaf->shared_cpu_map);
+		}
+		of_node_put(this_leaf->of_node);
+	}
+}
+
+static void free_cache_attributes(unsigned int cpu)
+{
+	cache_shared_cpu_map_remove(cpu);
+
+	kfree(per_cpu_cacheinfo(cpu));
+	per_cpu_cacheinfo(cpu) = NULL;
+}
+
+int __weak init_cache_level(unsigned int cpu)
+{
+	return -ENOENT;
+}
+
+int __weak populate_cache_leaves(unsigned int cpu)
+{
+	return -ENOENT;
+}
+
+static int detect_cache_attributes(unsigned int cpu)
+{
+	int ret;
+
+	if (init_cache_level(cpu))
+		return -ENOENT;
+
+	per_cpu_cacheinfo(cpu) = kcalloc(cache_leaves(cpu),
+					 sizeof(struct cacheinfo), GFP_KERNEL);
+	if (per_cpu_cacheinfo(cpu) == NULL)
+		return -ENOMEM;
+
+	ret = populate_cache_leaves(cpu);
+	if (ret)
+		goto free_ci;
+	/*
+	 * For systems using DT for cache hierarcy, of_node and shared_cpu_map
+	 * will be set up here. Otherwise populate_cache_leaves needs to set
+	 * shared_cpu_map and next-level-cache should not be specified in DT
+	 */
+	ret = of_cache_shared_cpu_map_setup(cpu);
+	if (ret)
+		goto free_ci;
+	return 0;
+
+free_ci:
+	free_cache_attributes(cpu);
+	return ret;
+}
+
+#ifdef CONFIG_SYSFS
+
+/* pointer to cpuX/cache device */
+static DEFINE_PER_CPU(struct device *, ci_cache_dev);
+#define per_cpu_cache_dev(cpu)	(per_cpu(ci_cache_dev, cpu))
+
+static cpumask_t cache_dev_map;
+
+/* pointer to array of devices for cpuX/cache/indexY */
+static DEFINE_PER_CPU(struct device **, ci_index_dev);
+#define per_cpu_index_dev(cpu)	(per_cpu(ci_index_dev, cpu))
+#define per_cache_index_dev(cpu, idx)	((per_cpu_index_dev(cpu))[idx])
+
+#define show_one(file_name, object)				\
+static ssize_t file_name##_show(struct device *dev,		\
+		struct device_attribute *attr, char *buf)	\
+{								\
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);	\
+	if (!this_leaf->object)					\
+		return -EINVAL;					\
+	return sprintf(buf, "%u\n", this_leaf->object);		\
+}
+
+show_one(level, level);
+show_one(coherency_line_size, coherency_line_size);
+show_one(number_of_sets, number_of_sets);
+show_one(physical_line_partition, physical_line_partition);
+
+static ssize_t ways_of_associativity_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+
+	/* will be zero for fully associative cache, but check for size */
+	if (!this_leaf->size)
+		return -EINVAL;
+	return sprintf(buf, "%u\n", this_leaf->ways_of_associativity);
+}
+
+static ssize_t size_show(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+
+	if (!this_leaf->size)
+		return -EINVAL;
+	return sprintf(buf, "%uK\n", this_leaf->size >> 10);
+}
+
+static ssize_t shared_cpumap_show_func(struct device *dev, int type, char *buf)
+{
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
+	int n = 0;
+
+	if (len > 1) {
+		const struct cpumask *mask = &this_leaf->shared_cpu_map;
+
+		n = type ? cpulist_scnprintf(buf, len - 2, mask) :
+			   cpumask_scnprintf(buf, len - 2, mask);
+		buf[n++] = '\n';
+		buf[n] = '\0';
+	}
+	return n;
+}
+
+static ssize_t shared_cpu_map_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return shared_cpumap_show_func(dev, 0, buf);
+}
+
+static ssize_t shared_cpu_list_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	return shared_cpumap_show_func(dev, 1, buf);
+}
+
+static ssize_t type_show(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+
+	switch (this_leaf->type) {
+	case CACHE_TYPE_DATA:
+		return sprintf(buf, "Data\n");
+	case CACHE_TYPE_INST:
+		return sprintf(buf, "Instruction\n");
+	case CACHE_TYPE_UNIFIED:
+		return sprintf(buf, "Unified\n");
+	default:
+		return -EINVAL;
+	}
+}
+
+static ssize_t attributes_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+	unsigned int ci_attr = this_leaf->attributes;
+	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf - 2;
+	int n = 0;
+
+	if (!ci_attr)
+		return -EINVAL;
+
+	if (ci_attr & CACHE_WRITE_THROUGH)
+		n += snprintf(buf + n, len - n, "WriteThrough\n");
+	if (ci_attr & CACHE_WRITE_BACK)
+		n += snprintf(buf + n, len - n, "WriteBack\n");
+	if (ci_attr & CACHE_READ_ALLOCATE)
+		n += snprintf(buf + n, len - n, "ReadAllocate\n");
+	if (ci_attr & CACHE_WRITE_ALLOCATE)
+		n += snprintf(buf + n, len - n, "WriteAllocate\n");
+	buf[n] = '\0';
+	return n;
+}
+
+static umode_t
+cache_default_attrs_is_visible(struct kobject *kobj,
+			       struct attribute *attr, int unused)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct device_attribute *dev_attr;
+	umode_t mode = attr->mode;
+	char *buf;
+
+	dev_attr = container_of(attr, struct device_attribute, attr);
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return 0;
+
+	/* create attributes that provides meaningful value */
+	if (dev_attr->show && dev_attr->show(dev, dev_attr, buf) < 0)
+		mode = 0;
+
+	kfree(buf);
+	return mode;
+}
+
+static DEVICE_ATTR_RO(level);
+static DEVICE_ATTR_RO(type);
+static DEVICE_ATTR_RO(coherency_line_size);
+static DEVICE_ATTR_RO(ways_of_associativity);
+static DEVICE_ATTR_RO(number_of_sets);
+static DEVICE_ATTR_RO(size);
+static DEVICE_ATTR_RO(attributes);
+static DEVICE_ATTR_RO(shared_cpu_map);
+static DEVICE_ATTR_RO(shared_cpu_list);
+static DEVICE_ATTR_RO(physical_line_partition);
+
+static struct attribute *cache_default_attrs[] = {
+	&dev_attr_type.attr,
+	&dev_attr_level.attr,
+	&dev_attr_shared_cpu_map.attr,
+	&dev_attr_shared_cpu_list.attr,
+	&dev_attr_coherency_line_size.attr,
+	&dev_attr_ways_of_associativity.attr,
+	&dev_attr_number_of_sets.attr,
+	&dev_attr_size.attr,
+	&dev_attr_attributes.attr,
+	&dev_attr_physical_line_partition.attr,
+	NULL
+};
+
+static const struct attribute_group cache_default_group = {
+	.attrs = cache_default_attrs,
+	.is_visible = cache_default_attrs_is_visible,
+};
+
+static const struct attribute_group *cache_default_groups[] = {
+	&cache_default_group,
+	NULL,
+};
+
+static struct attribute_group cache_private_group = {
+	.is_visible = cache_default_attrs_is_visible,
+};
+
+static const struct attribute_group *cache_private_groups[] = {
+	&cache_default_group,
+	(const struct attribute_group *)&cache_private_group,
+	NULL,
+};
+
+const struct attribute **
+__weak cache_get_priv_attr(struct cacheinfo *this_leaf)
+{
+	return NULL;
+}
+
+static const struct attribute_group **
+cache_get_attribute_groups(struct cacheinfo *this_leaf)
+{
+	const struct attribute **priv_attr = cache_get_priv_attr(this_leaf);
+
+	if (!priv_attr)
+		return cache_default_groups;
+
+	if (!cache_private_group.attrs)
+		cache_private_group.attrs = (struct attribute **)priv_attr;
+
+	return cache_private_groups;
+}
+
+/* Add/Remove cache interface for CPU device */
+static void cpu_cache_sysfs_exit(unsigned int cpu)
+{
+	int i;
+	struct device *ci_dev;
+
+	if (per_cpu_index_dev(cpu)) {
+		for (i = 0; i < cache_leaves(cpu); i++) {
+			ci_dev = per_cache_index_dev(cpu, i);
+			if (!ci_dev)
+				continue;
+			device_unregister(ci_dev);
+		}
+		kfree(per_cpu_index_dev(cpu));
+		per_cpu_index_dev(cpu) = NULL;
+	}
+	device_unregister(per_cpu_cache_dev(cpu));
+	per_cpu_cache_dev(cpu) = NULL;
+}
+
+static int cpu_cache_sysfs_init(unsigned int cpu)
+{
+	struct device *dev = get_cpu_device(cpu);
+
+	if (per_cpu_cacheinfo(cpu) == NULL)
+		return -ENOENT;
+
+	per_cpu_cache_dev(cpu) = device_create(dev->class, dev, cpu,
+					       NULL, "cache");
+	if (IS_ERR_OR_NULL(per_cpu_cache_dev(cpu)))
+		return PTR_ERR(per_cpu_cache_dev(cpu));
+
+	/* Allocate all required memory */
+	per_cpu_index_dev(cpu) = kcalloc(cache_leaves(cpu),
+					 sizeof(struct device *), GFP_KERNEL);
+	if (unlikely(per_cpu_index_dev(cpu) == NULL))
+		goto err_out;
+
+	return 0;
+
+err_out:
+	cpu_cache_sysfs_exit(cpu);
+	return -ENOMEM;
+}
+
+static int cache_add_dev(unsigned int cpu)
+{
+	unsigned short i;
+	int rc;
+	struct device *ci_dev, *parent;
+	struct cacheinfo *this_leaf;
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	const struct attribute_group **cache_groups;
+
+	rc = cpu_cache_sysfs_init(cpu);
+	if (unlikely(rc < 0))
+		return rc;
+
+	parent = per_cpu_cache_dev(cpu);
+	for (i = 0; i < cache_leaves(cpu); i++) {
+		this_leaf = this_cpu_ci->info_list + i;
+		if (this_leaf->disable_sysfs)
+			continue;
+		cache_groups = cache_get_attribute_groups(this_leaf);
+		ci_dev = device_create_with_groups(parent->class, parent, i,
+						   this_leaf, cache_groups,
+						   "index%1u", i);
+		if (IS_ERR_OR_NULL(ci_dev)) {
+			rc = PTR_ERR(ci_dev);
+			goto err;
+		}
+		per_cache_index_dev(cpu, i) = ci_dev;
+	}
+	cpumask_set_cpu(cpu, &cache_dev_map);
+
+	return 0;
+err:
+	cpu_cache_sysfs_exit(cpu);
+	return rc;
+}
+
+static void cache_remove_dev(unsigned int cpu)
+{
+	if (!cpumask_test_cpu(cpu, &cache_dev_map))
+		return;
+	cpumask_clear_cpu(cpu, &cache_dev_map);
+
+	cpu_cache_sysfs_exit(cpu);
+}
+
+static int cacheinfo_cpu_callback(struct notifier_block *nfb,
+				  unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	int rc = 0;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		rc = detect_cache_attributes(cpu);
+		if (!rc)
+			rc = cache_add_dev(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		cache_remove_dev(cpu);
+		if (per_cpu_cacheinfo(cpu))
+			free_cache_attributes(cpu);
+		break;
+	}
+	return notifier_from_errno(rc);
+}
+
+static int __init cacheinfo_sysfs_init(void)
+{
+	int cpu, rc = 0;
+
+	cpu_notifier_register_begin();
+
+	for_each_online_cpu(cpu) {
+		rc = detect_cache_attributes(cpu);
+		if (rc) {
+			pr_err("error detecting cacheinfo..cpu%d\n", cpu);
+			goto out;
+		}
+		rc = cache_add_dev(cpu);
+		if (rc) {
+			free_cache_attributes(cpu);
+			pr_err("error populating cacheinfo..cpu%d\n", cpu);
+			goto out;
+		}
+	}
+	__hotcpu_notifier(cacheinfo_cpu_callback, 0);
+
+out:
+	cpu_notifier_register_done();
+	return rc;
+}
+
+device_initcall(cacheinfo_sysfs_init);
+
+#endif	/* CONFIG_SYSFS */
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
new file mode 100644
index 000000000000..3d67b4910aa8
--- /dev/null
+++ b/include/linux/cacheinfo.h
@@ -0,0 +1,73 @@ 
+#ifndef _LINUX_CACHEINFO_H
+#define _LINUX_CACHEINFO_H
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/sysfs.h>
+
+enum cache_type {
+	CACHE_TYPE_NOCACHE = 0,
+	CACHE_TYPE_INST = BIT(0),
+	CACHE_TYPE_DATA = BIT(1),
+	CACHE_TYPE_SEPARATE = CACHE_TYPE_INST | CACHE_TYPE_DATA,
+	CACHE_TYPE_UNIFIED = BIT(2),
+};
+
+struct cacheinfo {
+	/* core properties */
+	enum cache_type type; /* data, inst or unified */
+	unsigned int level;
+	unsigned int coherency_line_size; /* cache line size  */
+	unsigned int number_of_sets; /* no. of sets per way */
+	unsigned int ways_of_associativity; /* no. of ways */
+	unsigned int physical_line_partition; /* no. of lines per tag */
+	unsigned int size; /* total cache size */
+	cpumask_t shared_cpu_map;
+	unsigned int attributes;
+#define CACHE_WRITE_THROUGH	BIT(0)
+#define CACHE_WRITE_BACK	BIT(1)
+#define CACHE_READ_ALLOCATE	BIT(2)
+#define CACHE_WRITE_ALLOCATE	BIT(3)
+
+	/* book keeping */
+	struct device_node *of_node;	/* cpu if no explicit cache node */
+	bool disable_sysfs; /* don't expose this leaf through sysfs */
+	void *priv;
+};
+
+struct cpu_cacheinfo {
+	struct cacheinfo *info_list;
+	unsigned int num_levels;
+	unsigned int num_leaves;
+};
+
+/*
+ * Helpers to make sure "func" is executed on the cpu whose cache
+ * attributes are being detected
+ */
+#define DEFINE_SMP_CALL_FUNCTION(func)				\
+static void _##func(void *ret)					\
+{								\
+	int cpu = smp_processor_id();				\
+	*(int *)ret = __##func(cpu);				\
+}								\
+								\
+int func(unsigned int cpu)					\
+{								\
+	int ret;						\
+	smp_call_function_single(cpu, _##func, &ret, true);	\
+	return ret;						\
+}
+
+struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu);
+int init_cache_level(unsigned int cpu);
+int populate_cache_leaves(unsigned int cpu);
+
+#ifdef CONFIG_SYSFS
+const struct attribute **cache_get_priv_attr(struct cacheinfo *this_leaf);
+#endif
+
+#endif /* _LINUX_CACHEINFO_H */