mbox series

[v4,0/4] Add interfaces for ACPI MRRM table

Message ID 20250429202412.380637-1-tony.luck@intel.com
Headers show
Series Add interfaces for ACPI MRRM table | expand

Message

Luck, Tony April 29, 2025, 8:24 p.m. UTC
Memory used to be homogeneous. Then NUMA came along. Later different
types of memory (persistent memory, on-package high bandwidth memory,
CXL attached memory).

Each type of memory has its own performance characteristics, and users
will need to monitor and control access by type.

The MRRM solution is to tag physical address ranges with "region IDs"
so that platform firmware[1] can indicate the type of memory for each
range (with separate tags available for local vs. remote access to
each range). Note that these ranges can include addresses reserved
for future hotplugged memory.

The region IDs will be used to provide separate event counts for each
region for "perf" and for the "resctrl" file system to monitor and
control memory bandwidth in each region.

Users will need to know the address range(s) that are part of each
region. This patch series adds
	/sys/firmware/acpi/memory_ranges/rangeX
directories to provide user space accessible enumeration.

-Tony

[1] MRRM definition allow for future expansion for the OS to assign
these region IDs.

Changes since version 3 here:
https://lore.kernel.org/all/20250410223207.257722-1-tony.luck@intel.com/

1) Rebase to v6.15-rc4
2) Removed ugly #ifdef in acpi_mrrm.c with better fix for CONFIG_NUMA=n
3) Moved stub for acpi_mrrm_max_mem_region() into #else !ACPI section
   of <linux/acpi.h>

Tony Luck (4):
  ACPICA: Define MRRM ACPI table
  ACPI/MRRM: Minimal parse of ACPI MRRM table
  ACPI/MRRM: Add /sys files to describe memory ranges
  ACPI: Add documentation for exposing MRRM data

 include/linux/acpi.h                          |   9 +
 include/acpi/actbl1.h                         |   7 +
 include/acpi/actbl2.h                         |  42 ++++
 drivers/acpi/acpi_mrrm.c                      | 183 ++++++++++++++++++
 Documentation/ABI/testing/sysfs-firmware-acpi |  21 ++
 arch/x86/Kconfig                              |   1 +
 drivers/acpi/Kconfig                          |   3 +
 drivers/acpi/Makefile                         |   1 +
 8 files changed, 267 insertions(+)
 create mode 100644 drivers/acpi/acpi_mrrm.c


base-commit: b4432656b36e5cc1d50a1f2dc15357543add530e

Comments

Fenghua Yu May 5, 2025, 6:23 a.m. UTC | #1
Hi, Tony,

On 4/29/25 13:24, Tony Luck wrote:
> Perf and resctrl users need an enumeration of which memory addresses
> are bound to which "region" tag.
>
> Parse the ACPI MRRM table and add /sys entries for each memory range
> describing base address, length, NUMA node, and which region tags apply
> for same-socket and cross-socket access.
>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>   drivers/acpi/acpi_mrrm.c | 143 ++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 142 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/acpi/acpi_mrrm.c b/drivers/acpi/acpi_mrrm.c
> index ab8022e58da5..f04645a0407f 100644
> --- a/drivers/acpi/acpi_mrrm.c
> +++ b/drivers/acpi/acpi_mrrm.c
> @@ -3,12 +3,16 @@
>    * Copyright (c) 2025, Intel Corporation.
>    *
>    * Memory Range and Region Mapping (MRRM) structure
> + *
> + * Parse and report the platform's MRRM table in /sys.
>    */
>   
>   #define pr_fmt(fmt) "acpi/mrrm: " fmt
>   
>   #include <linux/acpi.h>
>   #include <linux/init.h>
> +#include <linux/string.h>
> +#include <linux/sysfs.h>
>   
>   static int max_mem_region = -ENOENT;
>   
> @@ -18,25 +22,162 @@ int acpi_mrrm_max_mem_region(void)
>   	return max_mem_region;
>   }
>   
> +struct mrrm_mem_range_entry {
> +	u64 base;
> +	u64 length;
> +	int node;
> +	u8  local_region_id;
> +	u8  remote_region_id;
> +};
> +
> +static struct mrrm_mem_range_entry *mrrm_mem_range_entry;
> +static u32 mrrm_mem_entry_num;
> +
> +static int get_node_num(struct mrrm_mem_range_entry *e)
> +{
> +	unsigned int nid;
> +
> +	for_each_online_node(nid) {
> +		for (int z = 0; z < MAX_NR_ZONES; z++) {
> +			struct zone *zone = NODE_DATA(nid)->node_zones + z;
> +
> +			if (!populated_zone(zone))
> +				continue;
> +			if (zone_intersects(zone, PHYS_PFN(e->base), PHYS_PFN(e->length)))
> +				return zone_to_nid(zone);
> +		}
> +	}
> +
> +	return -ENOENT;
> +}
> +
>   static __init int acpi_parse_mrrm(struct acpi_table_header *table)
>   {
> +	struct acpi_mrrm_mem_range_entry *mre_entry;
>   	struct acpi_table_mrrm *mrrm;
> +	void *mre, *mrrm_end;
> +	int mre_count = 0;
>   
>   	mrrm = (struct acpi_table_mrrm *)table;
>   	if (!mrrm)
>   		return -ENODEV;
>   
> +	if (mrrm->flags & ACPI_MRRM_FLAGS_REGION_ASSIGNMENT_OS)
> +		return -EOPNOTSUPP;
> +
> +	mrrm_end = (void *)mrrm + mrrm->header.length - 1;
> +	mre = (void *)mrrm + sizeof(struct acpi_table_mrrm);
> +	while (mre < mrrm_end) {
> +		mre_entry = mre;
> +		mre_count++;
> +		mre += mre_entry->header.length;
> +	}
> +	if (!mre_count) {
> +		pr_info(FW_BUG "No ranges listed in MRRM table\n");
> +	 	return -EINVAL;
> +	}
> +
> +	mrrm_mem_range_entry = kmalloc_array(mre_count, sizeof(*mrrm_mem_range_entry),
> +					     GFP_KERNEL | __GFP_ZERO);
> +	if (!mrrm_mem_range_entry)
> +		return -ENOMEM;
> +
> +	mre = (void *)mrrm + sizeof(struct acpi_table_mrrm);
> +	while (mre < mrrm_end) {
> +		struct mrrm_mem_range_entry *e;
> +
> +		mre_entry = mre;
> +		e = mrrm_mem_range_entry + mrrm_mem_entry_num;
> +
> +		e->base = mre_entry->addr_base;
> +		e->length = mre_entry->addr_len;
> +		e->node = get_node_num(e);
> +
> +		if (mre_entry->region_id_flags & ACPI_MRRM_VALID_REGION_ID_FLAGS_LOCAL)
> +			e->local_region_id = mre_entry->local_region_id;
> +		else
> +			e->local_region_id = -1;
> +		if (mre_entry->region_id_flags & ACPI_MRRM_VALID_REGION_ID_FLAGS_REMOTE)
> +			e->remote_region_id = mre_entry->remote_region_id;
> +		else
> +			e->remote_region_id = -1;
> +
> +		mrrm_mem_entry_num++;
> +		mre += mre_entry->header.length;
> +	}
> +
>   	max_mem_region = mrrm->max_mem_region;
>   
>   	return 0;
>   }
>   
> +#define RANGE_ATTR(name, fmt)						\
> +static ssize_t name##_show(struct kobject *kobj,			\

"name" is used as a macro parameter. But "name" is also used as a 
variable mre->name in the macro. checkpatch complains this kind of usage.

Maybe change the parameter "name" as something like "range_name" to 
avoid the potential confusion?

> +			  struct kobj_attribute *attr, char *buf)	\
> +{									\
> +	struct mrrm_mem_range_entry *mre;				\
> +	const char *kname = kobject_name(kobj);				\
> +	int n, ret;							\
> +									\
> +	ret = kstrtoint(kname + 5, 10, &n);				\
> +	if (ret)							\
> +		return ret;						\
> +									\
> +	mre = mrrm_mem_range_entry + n;					\
> +									\
> +	return sysfs_emit(buf, fmt, mre->name);				\
> +}									\
> +static struct kobj_attribute name##_attr = __ATTR_RO(name)
> +
> +RANGE_ATTR(base, "0x%llx\n");
> +RANGE_ATTR(length, "0x%llx\n");
> +RANGE_ATTR(node, "%d\n");
> +RANGE_ATTR(local_region_id, "%d\n");
> +RANGE_ATTR(remote_region_id, "%d\n");
> +
> +static struct attribute *memory_range_attrs[] = {
> +	&base_attr.attr,
> +	&length_attr.attr,
> +	&node_attr.attr,
> +	&local_region_id_attr.attr,
> +	&remote_region_id_attr.attr,
> +	NULL
> +};
> +
> +ATTRIBUTE_GROUPS(memory_range);
> +
> +static __init int add_boot_memory_ranges(void)
> +{
> +	struct kobject *pkobj, *kobj;
> +	int ret = -EINVAL;
> +	char *name;
> +
> +	pkobj = kobject_create_and_add("memory_ranges", acpi_kobj);
> +
> +	for (int i = 0; i < mrrm_mem_entry_num; i++) {
> +		name = kasprintf(GFP_KERNEL, "range%d", i);
> +		if (!name)
> +			break;
> +
> +		kobj = kobject_create_and_add(name, pkobj);
> +
> +		ret = sysfs_create_groups(kobj, memory_range_groups);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return ret;
> +}
> +
>   static __init int mrrm_init(void)
>   {
>   	int ret;
>   
>   	ret = acpi_table_parse(ACPI_SIG_MRRM, acpi_parse_mrrm);
>   
This blank line seems redundant. Maybe remove it so that the "if (ret < 
0)" sentence follows the "ret = ...." sentence immediately?
> -	return ret;
> +	if (ret < 0)
> +		return ret;
> +
> +	return add_boot_memory_ranges();
>   }
>   device_initcall(mrrm_init);

Thanks.

-Fenghua
Luck, Tony May 5, 2025, 4:28 p.m. UTC | #2
On Sun, May 04, 2025 at 11:23:50PM -0700, Fenghua Yu wrote:
> Hi, Tony,
> > +#define RANGE_ATTR(name, fmt)						\
> > +static ssize_t name##_show(struct kobject *kobj,			\
> 
> "name" is used as a macro parameter. But "name" is also used as a variable
> mre->name in the macro. checkpatch complains this kind of usage.

The checkpatch complaint is that is is used twice. Once as "mre->name"
(as you noted), but also as "__ATTR_RO(name)" two lines after.

Checkpatch is worried that the macor might be invoked with an argument
that has side effects ("foo++", or return from function call "baz()")
which would result in the side-effects happening twice.

It's a false positive in this case because:

1) This macro is only used with a simple argument (and can only ever be
used in that way.
2) It's used for a static initialization of compile time, so a 2nd
reason why side-effects from an argment are not possible.

> Maybe change the parameter "name" as something like "range_name" to avoid
> the potential confusion?
> 
> > +			  struct kobj_attribute *attr, char *buf)	\
> > +{									\
> > +	struct mrrm_mem_range_entry *mre;				\
> > +	const char *kname = kobject_name(kobj);				\
> > +	int n, ret;							\
> > +									\
> > +	ret = kstrtoint(kname + 5, 10, &n);				\
> > +	if (ret)							\
> > +		return ret;						\
> > +									\
> > +	mre = mrrm_mem_range_entry + n;					\
> > +									\
> > +	return sysfs_emit(buf, fmt, mre->name);				\
> > +}									\
> > +static struct kobj_attribute name##_attr = __ATTR_RO(name)
> > +
> > +RANGE_ATTR(base, "0x%llx\n");
> > +RANGE_ATTR(length, "0x%llx\n");
> > +RANGE_ATTR(node, "%d\n");
> > +RANGE_ATTR(local_region_id, "%d\n");
> > +RANGE_ATTR(remote_region_id, "%d\n");

...

> > +
> >   static __init int mrrm_init(void)
> >   {
> >   	int ret;
> >   	ret = acpi_table_parse(ACPI_SIG_MRRM, acpi_parse_mrrm);
> This blank line seems redundant. Maybe remove it so that the "if (ret < 0)"
> sentence follows the "ret = ...." sentence immediately?

Agreed. I will delete in next version.

> > -	return ret;
> > +	if (ret < 0)
> > +		return ret;
> > +
> > +	return add_boot_memory_ranges();
> >   }
> >   device_initcall(mrrm_init);
> 
> Thanks.
> 
> -Fenghua

Thanks for the review.

-Tony
>