[PATCH v4 3/4] ACPI/MRRM: Add /sys files to describe memory ranges

Tony Luck posted 4 patches 9 months, 2 weeks ago
There is a newer version of this series
[PATCH v4 3/4] ACPI/MRRM: Add /sys files to describe memory ranges
Posted by Tony Luck 9 months, 2 weeks ago
Perf and resctrl users need an enumeration of which memory addresses
are bound to which "region" tag.

Parse the ACPI MRRM table and add /sys entries for each memory range
describing base address, length, NUMA node, and which region tags apply
for same-socket and cross-socket access.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/acpi/acpi_mrrm.c | 143 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 142 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/acpi_mrrm.c b/drivers/acpi/acpi_mrrm.c
index ab8022e58da5..f04645a0407f 100644
--- a/drivers/acpi/acpi_mrrm.c
+++ b/drivers/acpi/acpi_mrrm.c
@@ -3,12 +3,16 @@
  * Copyright (c) 2025, Intel Corporation.
  *
  * Memory Range and Region Mapping (MRRM) structure
+ *
+ * Parse and report the platform's MRRM table in /sys.
  */
 
 #define pr_fmt(fmt) "acpi/mrrm: " fmt
 
 #include <linux/acpi.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
 
 static int max_mem_region = -ENOENT;
 
@@ -18,25 +22,162 @@ int acpi_mrrm_max_mem_region(void)
 	return max_mem_region;
 }
 
+struct mrrm_mem_range_entry {
+	u64 base;
+	u64 length;
+	int node;
+	u8  local_region_id;
+	u8  remote_region_id;
+};
+
+static struct mrrm_mem_range_entry *mrrm_mem_range_entry;
+static u32 mrrm_mem_entry_num;
+
+static int get_node_num(struct mrrm_mem_range_entry *e)
+{
+	unsigned int nid;
+
+	for_each_online_node(nid) {
+		for (int z = 0; z < MAX_NR_ZONES; z++) {
+			struct zone *zone = NODE_DATA(nid)->node_zones + z;
+
+			if (!populated_zone(zone))
+				continue;
+			if (zone_intersects(zone, PHYS_PFN(e->base), PHYS_PFN(e->length)))
+				return zone_to_nid(zone);
+		}
+	}
+
+	return -ENOENT;
+}
+
 static __init int acpi_parse_mrrm(struct acpi_table_header *table)
 {
+	struct acpi_mrrm_mem_range_entry *mre_entry;
 	struct acpi_table_mrrm *mrrm;
+	void *mre, *mrrm_end;
+	int mre_count = 0;
 
 	mrrm = (struct acpi_table_mrrm *)table;
 	if (!mrrm)
 		return -ENODEV;
 
+	if (mrrm->flags & ACPI_MRRM_FLAGS_REGION_ASSIGNMENT_OS)
+		return -EOPNOTSUPP;
+
+	mrrm_end = (void *)mrrm + mrrm->header.length - 1;
+	mre = (void *)mrrm + sizeof(struct acpi_table_mrrm);
+	while (mre < mrrm_end) {
+		mre_entry = mre;
+		mre_count++;
+		mre += mre_entry->header.length;
+	}
+	if (!mre_count) {
+		pr_info(FW_BUG "No ranges listed in MRRM table\n");
+		return -EINVAL;
+	}
+
+	mrrm_mem_range_entry = kmalloc_array(mre_count, sizeof(*mrrm_mem_range_entry),
+					     GFP_KERNEL | __GFP_ZERO);
+	if (!mrrm_mem_range_entry)
+		return -ENOMEM;
+
+	mre = (void *)mrrm + sizeof(struct acpi_table_mrrm);
+	while (mre < mrrm_end) {
+		struct mrrm_mem_range_entry *e;
+
+		mre_entry = mre;
+		e = mrrm_mem_range_entry + mrrm_mem_entry_num;
+
+		e->base = mre_entry->addr_base;
+		e->length = mre_entry->addr_len;
+		e->node = get_node_num(e);
+
+		if (mre_entry->region_id_flags & ACPI_MRRM_VALID_REGION_ID_FLAGS_LOCAL)
+			e->local_region_id = mre_entry->local_region_id;
+		else
+			e->local_region_id = -1;
+		if (mre_entry->region_id_flags & ACPI_MRRM_VALID_REGION_ID_FLAGS_REMOTE)
+			e->remote_region_id = mre_entry->remote_region_id;
+		else
+			e->remote_region_id = -1;
+
+		mrrm_mem_entry_num++;
+		mre += mre_entry->header.length;
+	}
+
 	max_mem_region = mrrm->max_mem_region;
 
 	return 0;
 }
 
+#define RANGE_ATTR(name, fmt)						\
+static ssize_t name##_show(struct kobject *kobj,			\
+			  struct kobj_attribute *attr, char *buf)	\
+{									\
+	struct mrrm_mem_range_entry *mre;				\
+	const char *kname = kobject_name(kobj);				\
+	int n, ret;							\
+									\
+	ret = kstrtoint(kname + 5, 10, &n);				\
+	if (ret)							\
+		return ret;						\
+									\
+	mre = mrrm_mem_range_entry + n;					\
+									\
+	return sysfs_emit(buf, fmt, mre->name);				\
+}									\
+static struct kobj_attribute name##_attr = __ATTR_RO(name)
+
+RANGE_ATTR(base, "0x%llx\n");
+RANGE_ATTR(length, "0x%llx\n");
+RANGE_ATTR(node, "%d\n");
+RANGE_ATTR(local_region_id, "%d\n");
+RANGE_ATTR(remote_region_id, "%d\n");
+
+static struct attribute *memory_range_attrs[] = {
+	&base_attr.attr,
+	&length_attr.attr,
+	&node_attr.attr,
+	&local_region_id_attr.attr,
+	&remote_region_id_attr.attr,
+	NULL
+};
+
+ATTRIBUTE_GROUPS(memory_range);
+
+static __init int add_boot_memory_ranges(void)
+{
+	struct kobject *pkobj, *kobj;
+	int ret = -EINVAL;
+	char *name;
+
+	pkobj = kobject_create_and_add("memory_ranges", acpi_kobj);
+
+	for (int i = 0; i < mrrm_mem_entry_num; i++) {
+		name = kasprintf(GFP_KERNEL, "range%d", i);
+		if (!name)
+			break;
+
+		kobj = kobject_create_and_add(name, pkobj);
+
+		ret = sysfs_create_groups(kobj, memory_range_groups);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+
 static __init int mrrm_init(void)
 {
 	int ret;
 
 	ret = acpi_table_parse(ACPI_SIG_MRRM, acpi_parse_mrrm);
 
-	return ret;
+	if (ret < 0)
+		return ret;
+
+	return add_boot_memory_ranges();
 }
 device_initcall(mrrm_init);
-- 
2.48.1
Re: [PATCH v4 3/4] ACPI/MRRM: Add /sys files to describe memory ranges
Posted by Fenghua Yu 9 months, 1 week ago
Hi, Tony,

On 4/29/25 13:24, Tony Luck wrote:
> Perf and resctrl users need an enumeration of which memory addresses
> are bound to which "region" tag.
>
> Parse the ACPI MRRM table and add /sys entries for each memory range
> describing base address, length, NUMA node, and which region tags apply
> for same-socket and cross-socket access.
>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>   drivers/acpi/acpi_mrrm.c | 143 ++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 142 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/acpi/acpi_mrrm.c b/drivers/acpi/acpi_mrrm.c
> index ab8022e58da5..f04645a0407f 100644
> --- a/drivers/acpi/acpi_mrrm.c
> +++ b/drivers/acpi/acpi_mrrm.c
> @@ -3,12 +3,16 @@
>    * Copyright (c) 2025, Intel Corporation.
>    *
>    * Memory Range and Region Mapping (MRRM) structure
> + *
> + * Parse and report the platform's MRRM table in /sys.
>    */
>   
>   #define pr_fmt(fmt) "acpi/mrrm: " fmt
>   
>   #include <linux/acpi.h>
>   #include <linux/init.h>
> +#include <linux/string.h>
> +#include <linux/sysfs.h>
>   
>   static int max_mem_region = -ENOENT;
>   
> @@ -18,25 +22,162 @@ int acpi_mrrm_max_mem_region(void)
>   	return max_mem_region;
>   }
>   
> +struct mrrm_mem_range_entry {
> +	u64 base;
> +	u64 length;
> +	int node;
> +	u8  local_region_id;
> +	u8  remote_region_id;
> +};
> +
> +static struct mrrm_mem_range_entry *mrrm_mem_range_entry;
> +static u32 mrrm_mem_entry_num;
> +
> +static int get_node_num(struct mrrm_mem_range_entry *e)
> +{
> +	unsigned int nid;
> +
> +	for_each_online_node(nid) {
> +		for (int z = 0; z < MAX_NR_ZONES; z++) {
> +			struct zone *zone = NODE_DATA(nid)->node_zones + z;
> +
> +			if (!populated_zone(zone))
> +				continue;
> +			if (zone_intersects(zone, PHYS_PFN(e->base), PHYS_PFN(e->length)))
> +				return zone_to_nid(zone);
> +		}
> +	}
> +
> +	return -ENOENT;
> +}
> +
>   static __init int acpi_parse_mrrm(struct acpi_table_header *table)
>   {
> +	struct acpi_mrrm_mem_range_entry *mre_entry;
>   	struct acpi_table_mrrm *mrrm;
> +	void *mre, *mrrm_end;
> +	int mre_count = 0;
>   
>   	mrrm = (struct acpi_table_mrrm *)table;
>   	if (!mrrm)
>   		return -ENODEV;
>   
> +	if (mrrm->flags & ACPI_MRRM_FLAGS_REGION_ASSIGNMENT_OS)
> +		return -EOPNOTSUPP;
> +
> +	mrrm_end = (void *)mrrm + mrrm->header.length - 1;
> +	mre = (void *)mrrm + sizeof(struct acpi_table_mrrm);
> +	while (mre < mrrm_end) {
> +		mre_entry = mre;
> +		mre_count++;
> +		mre += mre_entry->header.length;
> +	}
> +	if (!mre_count) {
> +		pr_info(FW_BUG "No ranges listed in MRRM table\n");
> +	 	return -EINVAL;
> +	}
> +
> +	mrrm_mem_range_entry = kmalloc_array(mre_count, sizeof(*mrrm_mem_range_entry),
> +					     GFP_KERNEL | __GFP_ZERO);
> +	if (!mrrm_mem_range_entry)
> +		return -ENOMEM;
> +
> +	mre = (void *)mrrm + sizeof(struct acpi_table_mrrm);
> +	while (mre < mrrm_end) {
> +		struct mrrm_mem_range_entry *e;
> +
> +		mre_entry = mre;
> +		e = mrrm_mem_range_entry + mrrm_mem_entry_num;
> +
> +		e->base = mre_entry->addr_base;
> +		e->length = mre_entry->addr_len;
> +		e->node = get_node_num(e);
> +
> +		if (mre_entry->region_id_flags & ACPI_MRRM_VALID_REGION_ID_FLAGS_LOCAL)
> +			e->local_region_id = mre_entry->local_region_id;
> +		else
> +			e->local_region_id = -1;
> +		if (mre_entry->region_id_flags & ACPI_MRRM_VALID_REGION_ID_FLAGS_REMOTE)
> +			e->remote_region_id = mre_entry->remote_region_id;
> +		else
> +			e->remote_region_id = -1;
> +
> +		mrrm_mem_entry_num++;
> +		mre += mre_entry->header.length;
> +	}
> +
>   	max_mem_region = mrrm->max_mem_region;
>   
>   	return 0;
>   }
>   
> +#define RANGE_ATTR(name, fmt)						\
> +static ssize_t name##_show(struct kobject *kobj,			\

"name" is used as a macro parameter. But "name" is also used as a 
variable mre->name in the macro. checkpatch complains this kind of usage.

Maybe change the parameter "name" as something like "range_name" to 
avoid the potential confusion?

> +			  struct kobj_attribute *attr, char *buf)	\
> +{									\
> +	struct mrrm_mem_range_entry *mre;				\
> +	const char *kname = kobject_name(kobj);				\
> +	int n, ret;							\
> +									\
> +	ret = kstrtoint(kname + 5, 10, &n);				\
> +	if (ret)							\
> +		return ret;						\
> +									\
> +	mre = mrrm_mem_range_entry + n;					\
> +									\
> +	return sysfs_emit(buf, fmt, mre->name);				\
> +}									\
> +static struct kobj_attribute name##_attr = __ATTR_RO(name)
> +
> +RANGE_ATTR(base, "0x%llx\n");
> +RANGE_ATTR(length, "0x%llx\n");
> +RANGE_ATTR(node, "%d\n");
> +RANGE_ATTR(local_region_id, "%d\n");
> +RANGE_ATTR(remote_region_id, "%d\n");
> +
> +static struct attribute *memory_range_attrs[] = {
> +	&base_attr.attr,
> +	&length_attr.attr,
> +	&node_attr.attr,
> +	&local_region_id_attr.attr,
> +	&remote_region_id_attr.attr,
> +	NULL
> +};
> +
> +ATTRIBUTE_GROUPS(memory_range);
> +
> +static __init int add_boot_memory_ranges(void)
> +{
> +	struct kobject *pkobj, *kobj;
> +	int ret = -EINVAL;
> +	char *name;
> +
> +	pkobj = kobject_create_and_add("memory_ranges", acpi_kobj);
> +
> +	for (int i = 0; i < mrrm_mem_entry_num; i++) {
> +		name = kasprintf(GFP_KERNEL, "range%d", i);
> +		if (!name)
> +			break;
> +
> +		kobj = kobject_create_and_add(name, pkobj);
> +
> +		ret = sysfs_create_groups(kobj, memory_range_groups);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return ret;
> +}
> +
>   static __init int mrrm_init(void)
>   {
>   	int ret;
>   
>   	ret = acpi_table_parse(ACPI_SIG_MRRM, acpi_parse_mrrm);
>   
This blank line seems redundant. Maybe remove it so that the "if (ret < 
0)" sentence follows the "ret = ...." sentence immediately?
> -	return ret;
> +	if (ret < 0)
> +		return ret;
> +
> +	return add_boot_memory_ranges();
>   }
>   device_initcall(mrrm_init);

Thanks.

-Fenghua
Re: [PATCH v4 3/4] ACPI/MRRM: Add /sys files to describe memory ranges
Posted by Luck, Tony 9 months, 1 week ago
On Sun, May 04, 2025 at 11:23:50PM -0700, Fenghua Yu wrote:
> Hi, Tony,
> > +#define RANGE_ATTR(name, fmt)						\
> > +static ssize_t name##_show(struct kobject *kobj,			\
> 
> "name" is used as a macro parameter. But "name" is also used as a variable
> mre->name in the macro. checkpatch complains this kind of usage.

The checkpatch complaint is that is is used twice. Once as "mre->name"
(as you noted), but also as "__ATTR_RO(name)" two lines after.

Checkpatch is worried that the macor might be invoked with an argument
that has side effects ("foo++", or return from function call "baz()")
which would result in the side-effects happening twice.

It's a false positive in this case because:

1) This macro is only used with a simple argument (and can only ever be
used in that way.
2) It's used for a static initialization of compile time, so a 2nd
reason why side-effects from an argment are not possible.

> Maybe change the parameter "name" as something like "range_name" to avoid
> the potential confusion?
> 
> > +			  struct kobj_attribute *attr, char *buf)	\
> > +{									\
> > +	struct mrrm_mem_range_entry *mre;				\
> > +	const char *kname = kobject_name(kobj);				\
> > +	int n, ret;							\
> > +									\
> > +	ret = kstrtoint(kname + 5, 10, &n);				\
> > +	if (ret)							\
> > +		return ret;						\
> > +									\
> > +	mre = mrrm_mem_range_entry + n;					\
> > +									\
> > +	return sysfs_emit(buf, fmt, mre->name);				\
> > +}									\
> > +static struct kobj_attribute name##_attr = __ATTR_RO(name)
> > +
> > +RANGE_ATTR(base, "0x%llx\n");
> > +RANGE_ATTR(length, "0x%llx\n");
> > +RANGE_ATTR(node, "%d\n");
> > +RANGE_ATTR(local_region_id, "%d\n");
> > +RANGE_ATTR(remote_region_id, "%d\n");

...

> > +
> >   static __init int mrrm_init(void)
> >   {
> >   	int ret;
> >   	ret = acpi_table_parse(ACPI_SIG_MRRM, acpi_parse_mrrm);
> This blank line seems redundant. Maybe remove it so that the "if (ret < 0)"
> sentence follows the "ret = ...." sentence immediately?

Agreed. I will delete in next version.

> > -	return ret;
> > +	if (ret < 0)
> > +		return ret;
> > +
> > +	return add_boot_memory_ranges();
> >   }
> >   device_initcall(mrrm_init);
> 
> Thanks.
> 
> -Fenghua

Thanks for the review.

-Tony
>