Add code to parse the arm64 specific MPAM table, looking up the cache
level from the PPTT and feeding the end result into the MPAM driver.
For now the MPAM hook mpam_ris_create() is stubbed out, but will update
the MPAM driver with optional discovered data.
CC: Carl Worth <carl@os.amperecomputing.com>
Link: https://developer.arm.com/documentation/den0065/3-0bet/?lang=en
Signed-off-by: James Morse <james.morse@arm.com>
---
Changes since v1:
* Whitespace.
* Gave GLOBAL_AFFINITY a pre-processor'd name.
* Fixed assumption that there are zero functional dependencies.
* Bounds check walking of the MSC RIS.
* More bounds checking in the main table walk.
* Check for nonsense numbers of function dependencies.
* Smattering of pr_debug() to help folk feeding line-noise to the parser.
* Changed the comment flavour on the SPDX string.
* Removed additional table check.
* More comment wrangling.
Changes since RFC:
* Used DEFINE_RES_IRQ_NAMED() and friends macros.
* Additional error handling.
* Check for zero sized MSC.
* Allow table revisions greater than 1. (no spec for revision 0!)
* Use cleanup helpers to retrive ACPI tables, which allows some functions
to be folded together.
---
arch/arm64/Kconfig | 1 +
drivers/acpi/arm64/Kconfig | 3 +
drivers/acpi/arm64/Makefile | 1 +
drivers/acpi/arm64/mpam.c | 361 ++++++++++++++++++++++++++++++++++++
drivers/acpi/tables.c | 2 +-
include/linux/acpi.h | 12 ++
include/linux/arm_mpam.h | 48 +++++
7 files changed, 427 insertions(+), 1 deletion(-)
create mode 100644 drivers/acpi/arm64/mpam.c
create mode 100644 include/linux/arm_mpam.h
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 4be8a13505bf..6487c511bdc6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2062,6 +2062,7 @@ config ARM64_TLB_RANGE
config ARM64_MPAM
bool "Enable support for MPAM"
+ select ACPI_MPAM if ACPI
help
Memory System Resource Partitioning and Monitoring (MPAM) is an
optional extension to the Arm architecture that allows each
diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig
index b3ed6212244c..f2fd79f22e7d 100644
--- a/drivers/acpi/arm64/Kconfig
+++ b/drivers/acpi/arm64/Kconfig
@@ -21,3 +21,6 @@ config ACPI_AGDI
config ACPI_APMT
bool
+
+config ACPI_MPAM
+ bool
diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index 05ecde9eaabe..9390b57cb564 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI_APMT) += apmt.o
obj-$(CONFIG_ACPI_FFH) += ffh.o
obj-$(CONFIG_ACPI_GTDT) += gtdt.o
obj-$(CONFIG_ACPI_IORT) += iort.o
+obj-$(CONFIG_ACPI_MPAM) += mpam.o
obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o
obj-$(CONFIG_ARM_AMBA) += amba.o
obj-y += dma.o init.o
diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c
new file mode 100644
index 000000000000..fd9cfa143676
--- /dev/null
+++ b/drivers/acpi/arm64/mpam.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+
+/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */
+
+#define pr_fmt(fmt) "ACPI MPAM: " fmt
+
+#include <linux/acpi.h>
+#include <linux/arm_mpam.h>
+#include <linux/bits.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/platform_device.h>
+
+#include <acpi/processor.h>
+
+/*
+ * Flags for acpi_table_mpam_msc.*_interrupt_flags.
+ * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet.
+ */
+#define ACPI_MPAM_MSC_IRQ_MODE_MASK BIT(0)
+#define ACPI_MPAM_MSC_IRQ_TYPE_MASK GENMASK(2, 1)
+#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED 0
+#define ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER BIT(3)
+#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID BIT(4)
+
+static bool acpi_mpam_register_irq(struct platform_device *pdev, int intid,
+ u32 flags, int *irq,
+ u32 processor_container_uid)
+{
+ int sense;
+
+ if (!intid)
+ return false;
+
+ if (FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags) !=
+ ACPI_MPAM_MSC_IRQ_TYPE_WIRED)
+ return false;
+
+ sense = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE_MASK, flags);
+
+ if (16 <= intid && intid < 32 && processor_container_uid != GLOBAL_AFFINITY) {
+ pr_err_once("Partitioned interrupts not supported\n");
+ return false;
+ }
+
+ *irq = acpi_register_gsi(&pdev->dev, intid, sense, ACPI_ACTIVE_HIGH);
+ if (*irq <= 0) {
+ pr_err_once("Failed to register interrupt 0x%x with ACPI\n",
+ intid);
+ return false;
+ }
+
+ return true;
+}
+
+static void acpi_mpam_parse_irqs(struct platform_device *pdev,
+ struct acpi_mpam_msc_node *tbl_msc,
+ struct resource *res, int *res_idx)
+{
+ u32 flags, aff;
+ int irq;
+
+ flags = tbl_msc->overflow_interrupt_flags;
+ if (flags & ACPI_MPAM_MSC_IRQ_AFFINITY_VALID &&
+ flags & ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER)
+ aff = tbl_msc->overflow_interrupt_affinity;
+ else
+ aff = GLOBAL_AFFINITY;
+ if (acpi_mpam_register_irq(pdev, tbl_msc->overflow_interrupt, flags, &irq, aff))
+ res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "overflow");
+
+ flags = tbl_msc->error_interrupt_flags;
+ if (flags & ACPI_MPAM_MSC_IRQ_AFFINITY_VALID &&
+ flags & ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER)
+ aff = tbl_msc->error_interrupt_affinity;
+ else
+ aff = GLOBAL_AFFINITY;
+ if (acpi_mpam_register_irq(pdev, tbl_msc->error_interrupt, flags, &irq, aff))
+ res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error");
+}
+
+static int acpi_mpam_parse_resource(struct mpam_msc *msc,
+ struct acpi_mpam_resource_node *res)
+{
+ int level, nid;
+ u32 cache_id;
+
+ switch (res->locator_type) {
+ case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE:
+ cache_id = res->locator.cache_locator.cache_reference;
+ level = find_acpi_cache_level_from_id(cache_id);
+ if (level <= 0) {
+ pr_err_once("Bad level (%u) for cache with id %u\n", level, cache_id);
+ return -EINVAL;
+ }
+ return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE,
+ level, cache_id);
+ case ACPI_MPAM_LOCATION_TYPE_MEMORY:
+ nid = pxm_to_node(res->locator.memory_locator.proximity_domain);
+ if (nid == NUMA_NO_NODE)
+ nid = 0;
+ return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY,
+ 255, nid);
+ default:
+ /* These get discovered later and treated as unknown */
+ return 0;
+ }
+}
+
+int acpi_mpam_parse_resources(struct mpam_msc *msc,
+ struct acpi_mpam_msc_node *tbl_msc)
+{
+ int i, err;
+ char *ptr, *table_end;
+ struct acpi_mpam_resource_node *resource;
+
+ ptr = (char *)(tbl_msc + 1);
+ table_end = ptr + tbl_msc->length;
+ for (i = 0; i < tbl_msc->num_resource_nodes; i++) {
+ u64 max_deps, remaining_table;
+
+ if (ptr + sizeof(*resource) > table_end)
+ return -EINVAL;
+
+ resource = (struct acpi_mpam_resource_node *)ptr;
+
+ remaining_table = table_end - ptr;
+ max_deps = remaining_table / sizeof(struct acpi_mpam_func_deps);
+ if (resource->num_functional_deps > max_deps) {
+ pr_debug("MSC has impossible number of functional dependencies\n");
+ return -EINVAL;
+ }
+
+ err = acpi_mpam_parse_resource(msc, resource);
+ if (err)
+ return err;
+
+ ptr += sizeof(*resource);
+ ptr += resource->num_functional_deps * sizeof(struct acpi_mpam_func_deps);
+ }
+
+ return 0;
+}
+
+static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc,
+ struct platform_device *pdev,
+ u32 *acpi_id)
+{
+ char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1];
+ bool acpi_id_valid = false;
+ struct acpi_device *buddy;
+ char uid[11];
+ int err;
+
+ memset(&hid, 0, sizeof(hid));
+ memcpy(hid, &tbl_msc->hardware_id_linked_device,
+ sizeof(tbl_msc->hardware_id_linked_device));
+
+ if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) {
+ *acpi_id = tbl_msc->instance_id_linked_device;
+ acpi_id_valid = true;
+ }
+
+ err = snprintf(uid, sizeof(uid), "%u",
+ tbl_msc->instance_id_linked_device);
+ if (err >= sizeof(uid)) {
+ pr_debug("Failed to convert uid of device for power management.");
+ return acpi_id_valid;
+ }
+
+ buddy = acpi_dev_get_first_match_dev(hid, uid, -1);
+ if (buddy)
+ device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS);
+
+ return acpi_id_valid;
+}
+
+static int decode_interface_type(struct acpi_mpam_msc_node *tbl_msc,
+ enum mpam_msc_iface *iface)
+{
+ switch (tbl_msc->interface_type) {
+ case 0:
+ *iface = MPAM_IFACE_MMIO;
+ return 0;
+ case 0xa:
+ *iface = MPAM_IFACE_PCC;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int __init acpi_mpam_parse(void)
+{
+ struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0);
+ char *table_end, *table_offset = (char *)(table + 1);
+ struct property_entry props[4]; /* needs a sentinel */
+ struct acpi_mpam_msc_node *tbl_msc;
+ int next_res, next_prop, err = 0;
+ struct acpi_device *companion;
+ struct platform_device *pdev;
+ enum mpam_msc_iface iface;
+ struct resource res[3];
+ char uid[16];
+ u32 acpi_id;
+
+ if (acpi_disabled || !system_supports_mpam() || IS_ERR(table))
+ return 0;
+
+ if (table->revision < 1)
+ return 0;
+
+ table_end = (char *)table + table->length;
+
+ while (table_offset < table_end) {
+ tbl_msc = (struct acpi_mpam_msc_node *)table_offset;
+ table_offset += tbl_msc->length;
+
+ if (table_offset > table_end) {
+ pr_debug("MSC entry overlaps end of ACPI table\n");
+ break;
+ }
+
+ /*
+ * If any of the reserved fields are set, make no attempt to
+ * parse the MSC structure. This MSC will still be counted,
+ * meaning the MPAM driver can't probe against all MSC, and
+ * will never be enabled. There is no way to enable it safely,
+ * because we cannot determine safe system-wide partid and pmg
+ * ranges in this situation.
+ */
+ if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) {
+ pr_err_once("Unrecognised MSC, MPAM not usable\n");
+ pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier);
+ continue;
+ }
+
+ if (!tbl_msc->mmio_size) {
+ pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier);
+ continue;
+ }
+
+ if (decode_interface_type(tbl_msc, &iface)) {
+ pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier);
+ continue;
+ }
+
+ next_res = 0;
+ next_prop = 0;
+ memset(res, 0, sizeof(res));
+ memset(props, 0, sizeof(props));
+
+ pdev = platform_device_alloc("mpam_msc", tbl_msc->identifier);
+ if (!pdev) {
+ err = -ENOMEM;
+ break;
+ }
+
+ if (tbl_msc->length < sizeof(*tbl_msc)) {
+ err = -EINVAL;
+ break;
+ }
+
+ /* Some power management is described in the namespace: */
+ err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier);
+ if (err > 0 && err < sizeof(uid)) {
+ companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1);
+ if (companion)
+ ACPI_COMPANION_SET(&pdev->dev, companion);
+ else
+ pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier);
+ }
+
+ if (iface == MPAM_IFACE_MMIO) {
+ res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address,
+ tbl_msc->mmio_size,
+ "MPAM:MSC");
+ } else if (iface == MPAM_IFACE_PCC) {
+ props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel",
+ tbl_msc->base_address);
+ next_prop++;
+ }
+
+ acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res);
+ err = platform_device_add_resources(pdev, res, next_res);
+ if (err)
+ break;
+
+ props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us",
+ tbl_msc->max_nrdy_usec);
+
+ /*
+ * The MSC's CPU affinity is described via its linked power
+ * management device, but only if it points at a Processor or
+ * Processor Container.
+ */
+ if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) {
+ props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity",
+ acpi_id);
+ }
+
+ err = device_create_managed_software_node(&pdev->dev, props,
+ NULL);
+ if (err)
+ break;
+
+ /* Come back later if you want the RIS too */
+ err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length);
+ if (err)
+ break;
+
+ err = platform_device_add(pdev);
+ if (err)
+ break;
+ }
+
+ if (err)
+ platform_device_put(pdev);
+
+ return err;
+}
+
+int acpi_mpam_count_msc(void)
+{
+ struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0);
+ char *table_end, *table_offset = (char *)(table + 1);
+ struct acpi_mpam_msc_node *tbl_msc;
+ int count = 0;
+
+ if (IS_ERR(table))
+ return 0;
+
+ if (table->revision < 1)
+ return 0;
+
+ table_end = (char *)table + table->length;
+
+ while (table_offset < table_end) {
+ tbl_msc = (struct acpi_mpam_msc_node *)table_offset;
+ if (!tbl_msc->mmio_size)
+ continue;
+
+ if (tbl_msc->length < sizeof(*tbl_msc))
+ return -EINVAL;
+ if (tbl_msc->length > table_end - table_offset)
+ return -EINVAL;
+ table_offset += tbl_msc->length;
+
+ count++;
+ }
+
+ return count;
+}
+
+/*
+ * Call after ACPI devices have been created, which happens behind acpi_scan_init()
+ * called from subsys_initcall(). PCC requires the mailbox driver, which is
+ * initialised from postcore_initcall().
+ */
+subsys_initcall_sync(acpi_mpam_parse);
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index fa9bb8c8ce95..835e3795ede3 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -408,7 +408,7 @@ static const char table_sigs[][ACPI_NAMESEG_SIZE] __nonstring_array __initconst
ACPI_SIG_PSDT, ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT,
ACPI_SIG_IORT, ACPI_SIG_NFIT, ACPI_SIG_HMAT, ACPI_SIG_PPTT,
ACPI_SIG_NHLT, ACPI_SIG_AEST, ACPI_SIG_CEDT, ACPI_SIG_AGDI,
- ACPI_SIG_NBFT };
+ ACPI_SIG_NBFT, ACPI_SIG_MPAM };
#define ACPI_HEADER_SIZE sizeof(struct acpi_table_header)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c5fd92cda487..af449964426b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -8,6 +8,7 @@
#ifndef _LINUX_ACPI_H
#define _LINUX_ACPI_H
+#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/ioport.h> /* for struct resource */
#include <linux/resource_ext.h>
@@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void);
void acpi_table_init_complete (void);
int acpi_table_init (void);
+static inline struct acpi_table_header *acpi_get_table_ret(char *signature, u32 instance)
+{
+ struct acpi_table_header *table;
+ int status = acpi_get_table(signature, instance, &table);
+
+ if (ACPI_FAILURE(status))
+ return ERR_PTR(-ENOENT);
+ return table;
+}
+DEFINE_FREE(acpi_table, struct acpi_table_header *, if (!IS_ERR(_T)) acpi_put_table(_T))
+
int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
int __init_or_acpilib acpi_table_parse_entries(char *id,
unsigned long table_size, int entry_id,
diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h
new file mode 100644
index 000000000000..3d6c39c667c3
--- /dev/null
+++ b/include/linux/arm_mpam.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2025 Arm Ltd. */
+
+#ifndef __LINUX_ARM_MPAM_H
+#define __LINUX_ARM_MPAM_H
+
+#include <linux/acpi.h>
+#include <linux/types.h>
+
+#define GLOBAL_AFFINITY ~0
+
+struct mpam_msc;
+
+enum mpam_msc_iface {
+ MPAM_IFACE_MMIO, /* a real MPAM MSC */
+ MPAM_IFACE_PCC, /* a fake MPAM MSC */
+};
+
+enum mpam_class_types {
+ MPAM_CLASS_CACHE, /* Well known caches, e.g. L2 */
+ MPAM_CLASS_MEMORY, /* Main memory */
+ MPAM_CLASS_UNKNOWN, /* Everything else, e.g. SMMU */
+};
+
+#ifdef CONFIG_ACPI_MPAM
+/* Parse the ACPI description of resources entries for this MSC. */
+int acpi_mpam_parse_resources(struct mpam_msc *msc,
+ struct acpi_mpam_msc_node *tbl_msc);
+
+int acpi_mpam_count_msc(void);
+#else
+static inline int acpi_mpam_parse_resources(struct mpam_msc *msc,
+ struct acpi_mpam_msc_node *tbl_msc)
+{
+ return -EINVAL;
+}
+
+static inline int acpi_mpam_count_msc(void) { return -EINVAL; }
+#endif
+
+static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
+ enum mpam_class_types type, u8 class_id,
+ int component_id)
+{
+ return -EINVAL;
+}
+
+#endif /* __LINUX_ARM_MPAM_H */
--
2.39.5
Hi, James, On 9/10/25 13:42, James Morse wrote: > Add code to parse the arm64 specific MPAM table, looking up the cache > level from the PPTT and feeding the end result into the MPAM driver. > > For now the MPAM hook mpam_ris_create() is stubbed out, but will update > the MPAM driver with optional discovered data. > > CC: Carl Worth <carl@os.amperecomputing.com> > Link: https://developer.arm.com/documentation/den0065/3-0bet/?lang=en > Signed-off-by: James Morse <james.morse@arm.com> > > --- > Changes since v1: > * Whitespace. > * Gave GLOBAL_AFFINITY a pre-processor'd name. > * Fixed assumption that there are zero functional dependencies. > * Bounds check walking of the MSC RIS. > * More bounds checking in the main table walk. > * Check for nonsense numbers of function dependencies. > * Smattering of pr_debug() to help folk feeding line-noise to the parser. > * Changed the comment flavour on the SPDX string. > * Removed additional table check. > * More comment wrangling. > > Changes since RFC: > * Used DEFINE_RES_IRQ_NAMED() and friends macros. > * Additional error handling. > * Check for zero sized MSC. > * Allow table revisions greater than 1. (no spec for revision 0!) > * Use cleanup helpers to retrive ACPI tables, which allows some functions > to be folded together. > --- > arch/arm64/Kconfig | 1 + > drivers/acpi/arm64/Kconfig | 3 + > drivers/acpi/arm64/Makefile | 1 + > drivers/acpi/arm64/mpam.c | 361 ++++++++++++++++++++++++++++++++++++ > drivers/acpi/tables.c | 2 +- > include/linux/acpi.h | 12 ++ > include/linux/arm_mpam.h | 48 +++++ > 7 files changed, 427 insertions(+), 1 deletion(-) > create mode 100644 drivers/acpi/arm64/mpam.c > create mode 100644 include/linux/arm_mpam.h > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 4be8a13505bf..6487c511bdc6 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -2062,6 +2062,7 @@ config ARM64_TLB_RANGE > > config ARM64_MPAM > bool "Enable support for MPAM" > + select ACPI_MPAM if ACPI > help > Memory System Resource Partitioning and Monitoring (MPAM) is an > optional extension to the Arm architecture that allows each > diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig > index b3ed6212244c..f2fd79f22e7d 100644 > --- a/drivers/acpi/arm64/Kconfig > +++ b/drivers/acpi/arm64/Kconfig > @@ -21,3 +21,6 @@ config ACPI_AGDI > > config ACPI_APMT > bool > + > +config ACPI_MPAM > + bool > diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile > index 05ecde9eaabe..9390b57cb564 100644 > --- a/drivers/acpi/arm64/Makefile > +++ b/drivers/acpi/arm64/Makefile > @@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI_APMT) += apmt.o > obj-$(CONFIG_ACPI_FFH) += ffh.o > obj-$(CONFIG_ACPI_GTDT) += gtdt.o > obj-$(CONFIG_ACPI_IORT) += iort.o > +obj-$(CONFIG_ACPI_MPAM) += mpam.o > obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o > obj-$(CONFIG_ARM_AMBA) += amba.o > obj-y += dma.o init.o > diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c > new file mode 100644 > index 000000000000..fd9cfa143676 > --- /dev/null > +++ b/drivers/acpi/arm64/mpam.c > @@ -0,0 +1,361 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (C) 2025 Arm Ltd. > + > +/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */ > + > +#define pr_fmt(fmt) "ACPI MPAM: " fmt > + > +#include <linux/acpi.h> > +#include <linux/arm_mpam.h> > +#include <linux/bits.h> > +#include <linux/cpu.h> > +#include <linux/cpumask.h> > +#include <linux/platform_device.h> > + > +#include <acpi/processor.h> > + > +/* > + * Flags for acpi_table_mpam_msc.*_interrupt_flags. > + * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet. > + */ > +#define ACPI_MPAM_MSC_IRQ_MODE_MASK BIT(0) > +#define ACPI_MPAM_MSC_IRQ_TYPE_MASK GENMASK(2, 1) > +#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED 0 > +#define ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER BIT(3) > +#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID BIT(4) > + > +static bool acpi_mpam_register_irq(struct platform_device *pdev, int intid, > + u32 flags, int *irq, > + u32 processor_container_uid) > +{ > + int sense; > + > + if (!intid) > + return false; > + > + if (FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags) != > + ACPI_MPAM_MSC_IRQ_TYPE_WIRED) > + return false; > + > + sense = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE_MASK, flags); > + > + if (16 <= intid && intid < 32 && processor_container_uid != GLOBAL_AFFINITY) { > + pr_err_once("Partitioned interrupts not supported\n"); > + return false; > + } > + > + *irq = acpi_register_gsi(&pdev->dev, intid, sense, ACPI_ACTIVE_HIGH); > + if (*irq <= 0) { > + pr_err_once("Failed to register interrupt 0x%x with ACPI\n", > + intid); > + return false; > + } > + > + return true; > +} > + > +static void acpi_mpam_parse_irqs(struct platform_device *pdev, > + struct acpi_mpam_msc_node *tbl_msc, > + struct resource *res, int *res_idx) > +{ > + u32 flags, aff; > + int irq; > + > + flags = tbl_msc->overflow_interrupt_flags; > + if (flags & ACPI_MPAM_MSC_IRQ_AFFINITY_VALID && > + flags & ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER) > + aff = tbl_msc->overflow_interrupt_affinity; > + else > + aff = GLOBAL_AFFINITY; > + if (acpi_mpam_register_irq(pdev, tbl_msc->overflow_interrupt, flags, &irq, aff)) > + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "overflow"); > + > + flags = tbl_msc->error_interrupt_flags; > + if (flags & ACPI_MPAM_MSC_IRQ_AFFINITY_VALID && > + flags & ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER) > + aff = tbl_msc->error_interrupt_affinity; > + else > + aff = GLOBAL_AFFINITY; > + if (acpi_mpam_register_irq(pdev, tbl_msc->error_interrupt, flags, &irq, aff)) > + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); > +} > + > +static int acpi_mpam_parse_resource(struct mpam_msc *msc, > + struct acpi_mpam_resource_node *res) > +{ > + int level, nid; > + u32 cache_id; > + > + switch (res->locator_type) { > + case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: > + cache_id = res->locator.cache_locator.cache_reference; > + level = find_acpi_cache_level_from_id(cache_id); > + if (level <= 0) { > + pr_err_once("Bad level (%u) for cache with id %u\n", level, cache_id); Since level could be negative value here, printing it as %u converts it to positive value and will cause debug difficulty. For example, -ENOENT returned by find_acpi_cache_level_from_id() will be printed as 4294967294(instead of -2) which is hard to know the error code. Suggest to change this to %d: pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id); > + return -EINVAL; > + } > + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, > + level, cache_id); > + case ACPI_MPAM_LOCATION_TYPE_MEMORY: > + nid = pxm_to_node(res->locator.memory_locator.proximity_domain); > + if (nid == NUMA_NO_NODE) > + nid = 0; > + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, > + 255, nid); > + default: > + /* These get discovered later and treated as unknown */ > + return 0; > + } > +} > + > +int acpi_mpam_parse_resources(struct mpam_msc *msc, > + struct acpi_mpam_msc_node *tbl_msc) > +{ > + int i, err; > + char *ptr, *table_end; > + struct acpi_mpam_resource_node *resource; > + > + ptr = (char *)(tbl_msc + 1); > + table_end = ptr + tbl_msc->length; > + for (i = 0; i < tbl_msc->num_resource_nodes; i++) { > + u64 max_deps, remaining_table; > + > + if (ptr + sizeof(*resource) > table_end) > + return -EINVAL; > + > + resource = (struct acpi_mpam_resource_node *)ptr; > + > + remaining_table = table_end - ptr; > + max_deps = remaining_table / sizeof(struct acpi_mpam_func_deps); > + if (resource->num_functional_deps > max_deps) { > + pr_debug("MSC has impossible number of functional dependencies\n"); > + return -EINVAL; > + } > + > + err = acpi_mpam_parse_resource(msc, resource); > + if (err) > + return err; > + > + ptr += sizeof(*resource); > + ptr += resource->num_functional_deps * sizeof(struct acpi_mpam_func_deps); > + } > + > + return 0; > +} > + > +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, > + struct platform_device *pdev, > + u32 *acpi_id) > +{ > + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1]; > + bool acpi_id_valid = false; > + struct acpi_device *buddy; > + char uid[11]; > + int err; > + > + memset(&hid, 0, sizeof(hid)); > + memcpy(hid, &tbl_msc->hardware_id_linked_device, > + sizeof(tbl_msc->hardware_id_linked_device)); > + > + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { > + *acpi_id = tbl_msc->instance_id_linked_device; > + acpi_id_valid = true; > + } > + > + err = snprintf(uid, sizeof(uid), "%u", > + tbl_msc->instance_id_linked_device); > + if (err >= sizeof(uid)) { err could be negative error code. This error validation only checks size but not error code. Better to change it to if (err < 0 || err >= sizeof(uid)) [SNIP] Thanks. -Fenghua
On Wed, Sep 10, 2025 at 08:42:46PM +0000, James Morse wrote: > Add code to parse the arm64 specific MPAM table, looking up the cache > level from the PPTT and feeding the end result into the MPAM driver. > > For now the MPAM hook mpam_ris_create() is stubbed out, but will update > the MPAM driver with optional discovered data. > > CC: Carl Worth <carl@os.amperecomputing.com> > Link: https://developer.arm.com/documentation/den0065/3-0bet/?lang=en > Signed-off-by: James Morse <james.morse@arm.com> I have not noticed anything wrong - just some remarks/questions below. > --- > Changes since v1: > * Whitespace. > * Gave GLOBAL_AFFINITY a pre-processor'd name. > * Fixed assumption that there are zero functional dependencies. > * Bounds check walking of the MSC RIS. > * More bounds checking in the main table walk. > * Check for nonsense numbers of function dependencies. > * Smattering of pr_debug() to help folk feeding line-noise to the parser. > * Changed the comment flavour on the SPDX string. > * Removed additional table check. > * More comment wrangling. > > Changes since RFC: > * Used DEFINE_RES_IRQ_NAMED() and friends macros. > * Additional error handling. > * Check for zero sized MSC. > * Allow table revisions greater than 1. (no spec for revision 0!) > * Use cleanup helpers to retrive ACPI tables, which allows some functions > to be folded together. > --- > arch/arm64/Kconfig | 1 + > drivers/acpi/arm64/Kconfig | 3 + > drivers/acpi/arm64/Makefile | 1 + > drivers/acpi/arm64/mpam.c | 361 ++++++++++++++++++++++++++++++++++++ > drivers/acpi/tables.c | 2 +- > include/linux/acpi.h | 12 ++ > include/linux/arm_mpam.h | 48 +++++ > 7 files changed, 427 insertions(+), 1 deletion(-) > create mode 100644 drivers/acpi/arm64/mpam.c > create mode 100644 include/linux/arm_mpam.h > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 4be8a13505bf..6487c511bdc6 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -2062,6 +2062,7 @@ config ARM64_TLB_RANGE > > config ARM64_MPAM > bool "Enable support for MPAM" > + select ACPI_MPAM if ACPI > help > Memory System Resource Partitioning and Monitoring (MPAM) is an > optional extension to the Arm architecture that allows each > diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig > index b3ed6212244c..f2fd79f22e7d 100644 > --- a/drivers/acpi/arm64/Kconfig > +++ b/drivers/acpi/arm64/Kconfig > @@ -21,3 +21,6 @@ config ACPI_AGDI > > config ACPI_APMT > bool > + > +config ACPI_MPAM > + bool > diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile > index 05ecde9eaabe..9390b57cb564 100644 > --- a/drivers/acpi/arm64/Makefile > +++ b/drivers/acpi/arm64/Makefile > @@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI_APMT) += apmt.o > obj-$(CONFIG_ACPI_FFH) += ffh.o > obj-$(CONFIG_ACPI_GTDT) += gtdt.o > obj-$(CONFIG_ACPI_IORT) += iort.o > +obj-$(CONFIG_ACPI_MPAM) += mpam.o > obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o > obj-$(CONFIG_ARM_AMBA) += amba.o > obj-y += dma.o init.o > diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c > new file mode 100644 > index 000000000000..fd9cfa143676 > --- /dev/null > +++ b/drivers/acpi/arm64/mpam.c > @@ -0,0 +1,361 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (C) 2025 Arm Ltd. > + > +/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */ > + > +#define pr_fmt(fmt) "ACPI MPAM: " fmt > + > +#include <linux/acpi.h> > +#include <linux/arm_mpam.h> > +#include <linux/bits.h> > +#include <linux/cpu.h> > +#include <linux/cpumask.h> > +#include <linux/platform_device.h> > + > +#include <acpi/processor.h> > + > +/* > + * Flags for acpi_table_mpam_msc.*_interrupt_flags. > + * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet. > + */ > +#define ACPI_MPAM_MSC_IRQ_MODE_MASK BIT(0) > +#define ACPI_MPAM_MSC_IRQ_TYPE_MASK GENMASK(2, 1) > +#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED 0 > +#define ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER BIT(3) Nit: For consistency, not sure why MODE/TYPE_MASK and AFFINITY_{TYPE/VALID} aren't but that's not a big deal. BIT(3) to be consistent with the table should be (?) #define ACPI_MPAM_MSC_AFFINITY_TYPE_MASK BIT(3) to match the Table 5 (and then add defines for possible values). > +#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID BIT(4) ACPI_MPAM_MSC_AFFINITY_VALID_MASK ? (or remove _MASK from IRQ_{MODE/TYPE) ? Just noticed - feel free to ignore this altogether. > +static bool acpi_mpam_register_irq(struct platform_device *pdev, int intid, > + u32 flags, int *irq, > + u32 processor_container_uid) > +{ > + int sense; > + > + if (!intid) > + return false; > + > + if (FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags) != > + ACPI_MPAM_MSC_IRQ_TYPE_WIRED) > + return false; > + > + sense = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE_MASK, flags); > + > + if (16 <= intid && intid < 32 && processor_container_uid != GLOBAL_AFFINITY) { > + pr_err_once("Partitioned interrupts not supported\n"); > + return false; > + } Please add a comment to explain what you mean here (ie PPIs partitioning isn't supported). I will have to change this code anyway to cater for the GICv5 interrupt model given the hardcoded intid values. Is the condition allowed by the MPAM architecture so the MPAM table are legitimate (but not supported in Linux) ? > + > + *irq = acpi_register_gsi(&pdev->dev, intid, sense, ACPI_ACTIVE_HIGH); > + if (*irq <= 0) { > + pr_err_once("Failed to register interrupt 0x%x with ACPI\n", > + intid); > + return false; > + } > + > + return true; > +} > + > +static void acpi_mpam_parse_irqs(struct platform_device *pdev, > + struct acpi_mpam_msc_node *tbl_msc, > + struct resource *res, int *res_idx) > +{ > + u32 flags, aff; > + int irq; > + > + flags = tbl_msc->overflow_interrupt_flags; > + if (flags & ACPI_MPAM_MSC_IRQ_AFFINITY_VALID && > + flags & ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER) > + aff = tbl_msc->overflow_interrupt_affinity; > + else > + aff = GLOBAL_AFFINITY; > + if (acpi_mpam_register_irq(pdev, tbl_msc->overflow_interrupt, flags, &irq, aff)) > + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "overflow"); > + > + flags = tbl_msc->error_interrupt_flags; > + if (flags & ACPI_MPAM_MSC_IRQ_AFFINITY_VALID && > + flags & ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER) > + aff = tbl_msc->error_interrupt_affinity; > + else > + aff = GLOBAL_AFFINITY; > + if (acpi_mpam_register_irq(pdev, tbl_msc->error_interrupt, flags, &irq, aff)) > + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); > +} > + > +static int acpi_mpam_parse_resource(struct mpam_msc *msc, > + struct acpi_mpam_resource_node *res) > +{ > + int level, nid; > + u32 cache_id; > + > + switch (res->locator_type) { > + case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: > + cache_id = res->locator.cache_locator.cache_reference; > + level = find_acpi_cache_level_from_id(cache_id); > + if (level <= 0) { > + pr_err_once("Bad level (%u) for cache with id %u\n", level, cache_id); > + return -EINVAL; > + } > + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, > + level, cache_id); > + case ACPI_MPAM_LOCATION_TYPE_MEMORY: > + nid = pxm_to_node(res->locator.memory_locator.proximity_domain); > + if (nid == NUMA_NO_NODE) > + nid = 0; > + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, > + 255, nid); > + default: > + /* These get discovered later and treated as unknown */ > + return 0; > + } > +} > + > +int acpi_mpam_parse_resources(struct mpam_msc *msc, > + struct acpi_mpam_msc_node *tbl_msc) > +{ > + int i, err; > + char *ptr, *table_end; > + struct acpi_mpam_resource_node *resource; > + > + ptr = (char *)(tbl_msc + 1); > + table_end = ptr + tbl_msc->length; > + for (i = 0; i < tbl_msc->num_resource_nodes; i++) { > + u64 max_deps, remaining_table; > + > + if (ptr + sizeof(*resource) > table_end) > + return -EINVAL; > + > + resource = (struct acpi_mpam_resource_node *)ptr; > + > + remaining_table = table_end - ptr; > + max_deps = remaining_table / sizeof(struct acpi_mpam_func_deps); > + if (resource->num_functional_deps > max_deps) { > + pr_debug("MSC has impossible number of functional dependencies\n"); > + return -EINVAL; > + } > + > + err = acpi_mpam_parse_resource(msc, resource); > + if (err) > + return err; > + > + ptr += sizeof(*resource); > + ptr += resource->num_functional_deps * sizeof(struct acpi_mpam_func_deps); > + } > + > + return 0; > +} > + > +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, > + struct platform_device *pdev, > + u32 *acpi_id) > +{ > + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1]; > + bool acpi_id_valid = false; > + struct acpi_device *buddy; > + char uid[11]; > + int err; > + > + memset(&hid, 0, sizeof(hid)); Jonathan already commented on this. > + memcpy(hid, &tbl_msc->hardware_id_linked_device, > + sizeof(tbl_msc->hardware_id_linked_device)); > + > + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { > + *acpi_id = tbl_msc->instance_id_linked_device; > + acpi_id_valid = true; > + } > + > + err = snprintf(uid, sizeof(uid), "%u", > + tbl_msc->instance_id_linked_device); > + if (err >= sizeof(uid)) { > + pr_debug("Failed to convert uid of device for power management."); > + return acpi_id_valid; > + } > + > + buddy = acpi_dev_get_first_match_dev(hid, uid, -1); > + if (buddy) > + device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS); Is !buddy a FW error to be logged ? > + > + return acpi_id_valid; > +} > + > +static int decode_interface_type(struct acpi_mpam_msc_node *tbl_msc, > + enum mpam_msc_iface *iface) > +{ > + switch (tbl_msc->interface_type) { > + case 0: > + *iface = MPAM_IFACE_MMIO; > + return 0; > + case 0xa: Worth giving those constants 0x0,0xa a name ? > + *iface = MPAM_IFACE_PCC; > + return 0; > + default: > + return -EINVAL; > + } > +} > + > +static int __init acpi_mpam_parse(void) > +{ > + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); > + char *table_end, *table_offset = (char *)(table + 1); > + struct property_entry props[4]; /* needs a sentinel */ > + struct acpi_mpam_msc_node *tbl_msc; > + int next_res, next_prop, err = 0; > + struct acpi_device *companion; > + struct platform_device *pdev; > + enum mpam_msc_iface iface; > + struct resource res[3]; > + char uid[16]; > + u32 acpi_id; > + > + if (acpi_disabled || !system_supports_mpam() || IS_ERR(table)) > + return 0; > + > + if (table->revision < 1) > + return 0; > + > + table_end = (char *)table + table->length; > + > + while (table_offset < table_end) { > + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; > + table_offset += tbl_msc->length; > + > + if (table_offset > table_end) { > + pr_debug("MSC entry overlaps end of ACPI table\n"); > + break; > + } > + > + /* > + * If any of the reserved fields are set, make no attempt to > + * parse the MSC structure. This MSC will still be counted, > + * meaning the MPAM driver can't probe against all MSC, and > + * will never be enabled. There is no way to enable it safely, > + * because we cannot determine safe system-wide partid and pmg > + * ranges in this situation. > + */ > + if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) { > + pr_err_once("Unrecognised MSC, MPAM not usable\n"); > + pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier); > + continue; > + } This is a bit obscure - the comment too requires some explanation ("This MSC will still be counted", not very clear what that means). > + > + if (!tbl_msc->mmio_size) { > + pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier); > + continue; > + } > + > + if (decode_interface_type(tbl_msc, &iface)) { > + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); > + continue; > + } > + > + next_res = 0; > + next_prop = 0; > + memset(res, 0, sizeof(res)); > + memset(props, 0, sizeof(props)); > + > + pdev = platform_device_alloc("mpam_msc", tbl_msc->identifier); > + if (!pdev) { > + err = -ENOMEM; > + break; > + } > + > + if (tbl_msc->length < sizeof(*tbl_msc)) { > + err = -EINVAL; > + break; > + } > + > + /* Some power management is described in the namespace: */ > + err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier); > + if (err > 0 && err < sizeof(uid)) { > + companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1); > + if (companion) > + ACPI_COMPANION_SET(&pdev->dev, companion); > + else > + pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier); Here you are linking the platform device to a namespace companion. That's what will make sure that a) the ACPI namespace scan won't add an additional platform device for ARMHAA5C and b) MSIs works -> through the related IORT named component. Correct ? > + } > + > + if (iface == MPAM_IFACE_MMIO) { > + res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address, > + tbl_msc->mmio_size, > + "MPAM:MSC"); > + } else if (iface == MPAM_IFACE_PCC) { > + props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel", > + tbl_msc->base_address); > + next_prop++; > + } > + > + acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res); Do we _really_ have to resolve IRQs here or we can postpone them at driver probe time like RIS resources (if I understand correctly how it is done - by copying table data into platform data) ? GICv5 hat in mind - good as it is for GICv3. > + err = platform_device_add_resources(pdev, res, next_res); > + if (err) > + break; > + > + props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us", > + tbl_msc->max_nrdy_usec); > + > + /* > + * The MSC's CPU affinity is described via its linked power > + * management device, but only if it points at a Processor or > + * Processor Container. > + */ > + if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) { > + props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", > + acpi_id); > + } > + > + err = device_create_managed_software_node(&pdev->dev, props, > + NULL); > + if (err) > + break; > + > + /* Come back later if you want the RIS too */ I read this as: copy table data to the device so that RIS resources can be parsed later. Right ? I think it is worth updating the comment to clarify it. > + err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length); > + if (err) > + break; > + > + err = platform_device_add(pdev); > + if (err) > + break; > + } > + > + if (err) > + platform_device_put(pdev); I won't comment on the clean-up here as Jonathan did it already. > + > + return err; > +} > + > +int acpi_mpam_count_msc(void) > +{ > + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); > + char *table_end, *table_offset = (char *)(table + 1); > + struct acpi_mpam_msc_node *tbl_msc; > + int count = 0; > + > + if (IS_ERR(table)) > + return 0; > + > + if (table->revision < 1) > + return 0; > + > + table_end = (char *)table + table->length; > + > + while (table_offset < table_end) { > + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; > + if (!tbl_msc->mmio_size) > + continue; > + > + if (tbl_msc->length < sizeof(*tbl_msc)) > + return -EINVAL; > + if (tbl_msc->length > table_end - table_offset) > + return -EINVAL; > + table_offset += tbl_msc->length; > + > + count++; > + } > + > + return count; > +} > + > +/* > + * Call after ACPI devices have been created, which happens behind acpi_scan_init() > + * called from subsys_initcall(). PCC requires the mailbox driver, which is > + * initialised from postcore_initcall(). I think we will end up setting in stone init ordering for these components created out of static tables (I mean sequencing them in a centralized way) but if that works for the current driver that's fine for the time being. > + */ > +subsys_initcall_sync(acpi_mpam_parse); > diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c > index fa9bb8c8ce95..835e3795ede3 100644 > --- a/drivers/acpi/tables.c > +++ b/drivers/acpi/tables.c > @@ -408,7 +408,7 @@ static const char table_sigs[][ACPI_NAMESEG_SIZE] __nonstring_array __initconst > ACPI_SIG_PSDT, ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, > ACPI_SIG_IORT, ACPI_SIG_NFIT, ACPI_SIG_HMAT, ACPI_SIG_PPTT, > ACPI_SIG_NHLT, ACPI_SIG_AEST, ACPI_SIG_CEDT, ACPI_SIG_AGDI, > - ACPI_SIG_NBFT }; > + ACPI_SIG_NBFT, ACPI_SIG_MPAM }; > > #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header) > > diff --git a/include/linux/acpi.h b/include/linux/acpi.h > index c5fd92cda487..af449964426b 100644 > --- a/include/linux/acpi.h > +++ b/include/linux/acpi.h > @@ -8,6 +8,7 @@ > #ifndef _LINUX_ACPI_H > #define _LINUX_ACPI_H > > +#include <linux/cleanup.h> > #include <linux/errno.h> > #include <linux/ioport.h> /* for struct resource */ > #include <linux/resource_ext.h> > @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); > void acpi_table_init_complete (void); > int acpi_table_init (void); > > +static inline struct acpi_table_header *acpi_get_table_ret(char *signature, u32 instance) > +{ > + struct acpi_table_header *table; > + int status = acpi_get_table(signature, instance, &table); > + > + if (ACPI_FAILURE(status)) > + return ERR_PTR(-ENOENT); > + return table; > +} > +DEFINE_FREE(acpi_table, struct acpi_table_header *, if (!IS_ERR(_T)) acpi_put_table(_T)) Jonathan commented on this already - worth getting Rafael's opinion, I am fine either way. I have not found anything that should block this code (other than code that I know I will have to rework when GICv5 ACPI support gets in) so: Reviewed-by: Lorenzo Pieralisi <lpieralisi@kernel.org> > + > int acpi_table_parse(char *id, acpi_tbl_table_handler handler); > int __init_or_acpilib acpi_table_parse_entries(char *id, > unsigned long table_size, int entry_id, > diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h > new file mode 100644 > index 000000000000..3d6c39c667c3 > --- /dev/null > +++ b/include/linux/arm_mpam.h > @@ -0,0 +1,48 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* Copyright (C) 2025 Arm Ltd. */ > + > +#ifndef __LINUX_ARM_MPAM_H > +#define __LINUX_ARM_MPAM_H > + > +#include <linux/acpi.h> > +#include <linux/types.h> > + > +#define GLOBAL_AFFINITY ~0 > + > +struct mpam_msc; > + > +enum mpam_msc_iface { > + MPAM_IFACE_MMIO, /* a real MPAM MSC */ > + MPAM_IFACE_PCC, /* a fake MPAM MSC */ > +}; > + > +enum mpam_class_types { > + MPAM_CLASS_CACHE, /* Well known caches, e.g. L2 */ > + MPAM_CLASS_MEMORY, /* Main memory */ > + MPAM_CLASS_UNKNOWN, /* Everything else, e.g. SMMU */ > +}; > + > +#ifdef CONFIG_ACPI_MPAM > +/* Parse the ACPI description of resources entries for this MSC. */ > +int acpi_mpam_parse_resources(struct mpam_msc *msc, > + struct acpi_mpam_msc_node *tbl_msc); > + > +int acpi_mpam_count_msc(void); > +#else > +static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, > + struct acpi_mpam_msc_node *tbl_msc) > +{ > + return -EINVAL; > +} > + > +static inline int acpi_mpam_count_msc(void) { return -EINVAL; } > +#endif > + > +static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, > + enum mpam_class_types type, u8 class_id, > + int component_id) > +{ > + return -EINVAL; > +} > + > +#endif /* __LINUX_ARM_MPAM_H */ > -- > 2.39.5 >
Hi Lorenzo, On 11/09/2025 15:56, Lorenzo Pieralisi wrote: > On Wed, Sep 10, 2025 at 08:42:46PM +0000, James Morse wrote: >> Add code to parse the arm64 specific MPAM table, looking up the cache >> level from the PPTT and feeding the end result into the MPAM driver. >> >> For now the MPAM hook mpam_ris_create() is stubbed out, but will update >> the MPAM driver with optional discovered data. >> diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c >> new file mode 100644 >> index 000000000000..fd9cfa143676 >> --- /dev/null >> +++ b/drivers/acpi/arm64/mpam.c >> @@ -0,0 +1,361 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +// Copyright (C) 2025 Arm Ltd. >> + >> +/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */ >> + >> +#define pr_fmt(fmt) "ACPI MPAM: " fmt >> + >> +#include <linux/acpi.h> >> +#include <linux/arm_mpam.h> >> +#include <linux/bits.h> >> +#include <linux/cpu.h> >> +#include <linux/cpumask.h> >> +#include <linux/platform_device.h> >> + >> +#include <acpi/processor.h> >> + >> +/* >> + * Flags for acpi_table_mpam_msc.*_interrupt_flags. >> + * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet. >> + */ >> +#define ACPI_MPAM_MSC_IRQ_MODE_MASK BIT(0) >> +#define ACPI_MPAM_MSC_IRQ_TYPE_MASK GENMASK(2, 1) >> +#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED 0 >> +#define ACPI_MPAM_MSC_IRQ_AFFINITY_PROCESSOR_CONTAINER BIT(3) > Nit: For consistency, not sure why MODE/TYPE_MASK and > AFFINITY_{TYPE/VALID} aren't but that's not a big deal. The missing word is giving me trouble working out what this should be... There used to be definitions for MODE_LEVEL and MODE_EDGE, hence the MODE_MASK to extract the bit - but Jonathan pointed out the polarity values were pretty standard, and ocul be fed to acpi_register_gsi() directly without mapping 0->0 and 1->1. > BIT(3) to be consistent with the table should be (?) > > #define ACPI_MPAM_MSC_AFFINITY_TYPE_MASK BIT(3) > > to match the Table 5 (and then add defines for possible values). Fixed as: | #define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK BIT(3) | #define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR 0 | #define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER 1 >> +#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID BIT(4) > > ACPI_MPAM_MSC_AFFINITY_VALID_MASK ? (or remove _MASK from IRQ_{MODE/TYPE) ? > > Just noticed - feel free to ignore this altogether. I think the _MASK stuff encourages use of FIELD_GET(), which in turn encourages local variables with meaningful names. But I don't think I need a mask and values defined for the valid bit - which is readable enough already, or the mode as the bits are passed through directly. > >> +static bool acpi_mpam_register_irq(struct platform_device *pdev, int intid, >> + u32 flags, int *irq, >> + u32 processor_container_uid) >> +{ >> + int sense; >> + >> + if (!intid) >> + return false; >> + >> + if (FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags) != >> + ACPI_MPAM_MSC_IRQ_TYPE_WIRED) >> + return false; >> + >> + sense = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE_MASK, flags); >> + >> + if (16 <= intid && intid < 32 && processor_container_uid != GLOBAL_AFFINITY) { >> + pr_err_once("Partitioned interrupts not supported\n"); >> + return false; >> + } > > Please add a comment to explain what you mean here (ie PPIs partitioning > isn't supported). The error message isn't enough? > I will have to change this code anyway to cater for the GICv5 interrupt model given the > hardcoded intid values. Ah. I can probably detect PPIs from the table description instead. It's done like this just because this was where the handling logic used to be, but if that's a nuisance for the GICv5 handling, I'll do it purely from the table. I have the code to support ppi-paritions, but its not worth it unless someone builds this. > Is the condition allowed by the MPAM architecture so the MPAM table are > legitimate (but not supported in Linux) ? Yeah, the architecture says the interrupt can be PPI. Global PPI are supported, but not partitions on ACPI platforms. Because its valid/easy for DT, its hard to say no-one will ever do this with ACPI. Describing the affinity in the table lets the OS decide whether to support it. This ends up as as helper: | static bool _is_ppi_partition(u32 flags) | { | u32 aff_type, is_ppi; | bool ret; | | is_ppi = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_VALID, flags); | if (!is_ppi) | return false; | | aff_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK, flags); | ret = (aff_type == ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER); | if (ret) | pr_err_once("Partitioned interrupts not supported\n"); | | return ret; | } and a call in acpi_mpam_register_irq() to return false for ppi-partitions. This also allows some simplification in acpi_mpam_parse_irqs(). >> + >> + *irq = acpi_register_gsi(&pdev->dev, intid, sense, ACPI_ACTIVE_HIGH); >> + if (*irq <= 0) { >> + pr_err_once("Failed to register interrupt 0x%x with ACPI\n", >> + intid); >> + return false; >> + } >> + >> + return true; >> +} >> +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, >> + struct platform_device *pdev, >> + u32 *acpi_id) >> +{ >> + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1]; >> + bool acpi_id_valid = false; >> + struct acpi_device *buddy; >> + char uid[11]; >> + int err; >> + >> + memset(&hid, 0, sizeof(hid)); > > Jonathan already commented on this. Yup, its gone. >> + memcpy(hid, &tbl_msc->hardware_id_linked_device, >> + sizeof(tbl_msc->hardware_id_linked_device)); >> + >> + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { >> + *acpi_id = tbl_msc->instance_id_linked_device; >> + acpi_id_valid = true; >> + } >> + >> + err = snprintf(uid, sizeof(uid), "%u", >> + tbl_msc->instance_id_linked_device); >> + if (err >= sizeof(uid)) { >> + pr_debug("Failed to convert uid of device for power management."); >> + return acpi_id_valid; >> + } >> + >> + buddy = acpi_dev_get_first_match_dev(hid, uid, -1); >> + if (buddy) >> + device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS); > Is !buddy a FW error to be logged ? I'm pretty sure that field is optional, its for platform specific power management. The spec has "This field must be set to zero if there is no linked device for this MSC". This is where I expect a search for "\0\0\0\0" to fail, hence its silent. I think this thing is for the power management of the parent device, whereas the MSC namespace object is for the power management of the MSC itself. I've no idea why they need to be separate... >> + >> + return acpi_id_valid; >> +} >> + >> +static int decode_interface_type(struct acpi_mpam_msc_node *tbl_msc, >> + enum mpam_msc_iface *iface) >> +{ >> + switch (tbl_msc->interface_type) { >> + case 0: >> + *iface = MPAM_IFACE_MMIO; >> + return 0; >> + case 0xa: > > Worth giving those constants 0x0,0xa a name ? Sure, added earlier in the file: | /* | * Encodings for the MSC node body interface type field. | * See 2.1 MPAM MSC node, Table 4 of DEN0065B_MPAM_ACPI_3.0-bet. | */ | #define ACPI_MPAM_MSC_IFACE_MMIO 0x00 | #define ACPI_MPAM_MSC_IFACE_PCC 0x0a > >> + *iface = MPAM_IFACE_PCC; >> + return 0; >> + default: >> + return -EINVAL; >> + } >> +} >> + >> +static int __init acpi_mpam_parse(void) >> +{ >> + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); >> + char *table_end, *table_offset = (char *)(table + 1); >> + struct property_entry props[4]; /* needs a sentinel */ >> + struct acpi_mpam_msc_node *tbl_msc; >> + int next_res, next_prop, err = 0; >> + struct acpi_device *companion; >> + struct platform_device *pdev; >> + enum mpam_msc_iface iface; >> + struct resource res[3]; >> + char uid[16]; >> + u32 acpi_id; >> + >> + if (acpi_disabled || !system_supports_mpam() || IS_ERR(table)) >> + return 0; >> + >> + if (table->revision < 1) >> + return 0; >> + >> + table_end = (char *)table + table->length; >> + >> + while (table_offset < table_end) { >> + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; >> + table_offset += tbl_msc->length; >> + >> + if (table_offset > table_end) { >> + pr_debug("MSC entry overlaps end of ACPI table\n"); >> + break; >> + } >> + >> + /* >> + * If any of the reserved fields are set, make no attempt to >> + * parse the MSC structure. This MSC will still be counted, >> + * meaning the MPAM driver can't probe against all MSC, and >> + * will never be enabled. There is no way to enable it safely, >> + * because we cannot determine safe system-wide partid and pmg >> + * ranges in this situation. >> + */ >> + if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) { >> + pr_err_once("Unrecognised MSC, MPAM not usable\n"); >> + pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier); >> + continue; >> + } > This is a bit obscure - the comment too requires some explanation > ("This MSC will still be counted", not very clear what that means). I'll expand it - counted by acpi_mpam_count_msc(). It's trying to explain why its perfectly safe to just skip MSC here - that prevents the driver from ever touching the hardware. >> + >> + if (!tbl_msc->mmio_size) { >> + pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier); >> + continue; >> + } >> + >> + if (decode_interface_type(tbl_msc, &iface)) { >> + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); >> + continue; >> + } >> + >> + next_res = 0; >> + next_prop = 0; >> + memset(res, 0, sizeof(res)); >> + memset(props, 0, sizeof(props)); >> + >> + pdev = platform_device_alloc("mpam_msc", tbl_msc->identifier); >> + if (!pdev) { >> + err = -ENOMEM; >> + break; >> + } >> + >> + if (tbl_msc->length < sizeof(*tbl_msc)) { >> + err = -EINVAL; >> + break; >> + } >> + >> + /* Some power management is described in the namespace: */ >> + err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier); >> + if (err > 0 && err < sizeof(uid)) { >> + companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1); >> + if (companion) >> + ACPI_COMPANION_SET(&pdev->dev, companion); >> + else >> + pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier); > > Here you are linking the platform device to a namespace companion. > That's what will make sure that a) the ACPI namespace scan won't add an > additional platform device for ARMHAA5C and b) MSIs works -> through > the related IORT named component. > > Correct ? I don't think anyone would put an MSC behind an IOMMU, so hopefully this is not needed for MSI. I've not touched the MSI support yet. 2.6 of the spec says this is for power management ... I've hooked it up here because its there. I assumed the power-management core needed to know about it. It's the same device, hence calling it the companion, (which is how the spec people referred to it verbally). I do see a the namespace platform device get created, is that not supposed to happen? I'm testing this with the FVP, so don't have a way of actually triggering a power management flow. >> + } >> + >> + if (iface == MPAM_IFACE_MMIO) { >> + res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address, >> + tbl_msc->mmio_size, >> + "MPAM:MSC"); >> + } else if (iface == MPAM_IFACE_PCC) { >> + props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel", >> + tbl_msc->base_address); >> + next_prop++; >> + } >> + >> + acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res); > > Do we _really_ have to resolve IRQs here or we can postpone them at > driver probe time like RIS resources (if I understand correctly how > it is done - by copying table data into platform data) ? Simply because that is the order that DT does it too. We'd probably get away with it, but the interrupts are a property of the MSC, not the description of the resource it controls... I'm not sure what doing this would buy us. > GICv5 hat in mind - good as it is for GICv3. What changes here for GICv5? I certainly want it to work with whatever irqchip is in use. >> + err = platform_device_add_resources(pdev, res, next_res); >> + if (err) >> + break; >> + >> + props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us", >> + tbl_msc->max_nrdy_usec); >> + >> + /* >> + * The MSC's CPU affinity is described via its linked power >> + * management device, but only if it points at a Processor or >> + * Processor Container. >> + */ >> + if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) { >> + props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", >> + acpi_id); >> + } >> + >> + err = device_create_managed_software_node(&pdev->dev, props, >> + NULL); >> + if (err) >> + break; >> + >> + /* Come back later if you want the RIS too */ > > I read this as: copy table data to the device so that RIS resources can > be parsed later. > > Right ? I think it is worth updating the comment to clarify it. Sure, /* * Stash the table entry for acpi_mpam_parse_resources() to discover * what this MSC controls. */ >> + err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length); >> + if (err) >> + break; >> + >> + err = platform_device_add(pdev); >> + if (err) >> + break; >> + } >> + >> + if (err) >> + platform_device_put(pdev); > > I won't comment on the clean-up here as Jonathan did it already. > >> + >> + return err; >> +} >> + >> +/* >> + * Call after ACPI devices have been created, which happens behind acpi_scan_init() >> + * called from subsys_initcall(). PCC requires the mailbox driver, which is >> + * initialised from postcore_initcall(). > I think we will end up setting in stone init ordering for these > components created out of static tables (I mean sequencing them in a > centralized way) but if that works for the current driver that's fine > for the time being. That'd be better - I've noted the dependencies on each one of these. >> diff --git a/include/linux/acpi.h b/include/linux/acpi.h >> index c5fd92cda487..af449964426b 100644 >> --- a/include/linux/acpi.h >> +++ b/include/linux/acpi.h >> @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); >> void acpi_table_init_complete (void); >> int acpi_table_init (void); >> >> +static inline struct acpi_table_header *acpi_get_table_ret(char *signature, u32 instance) >> +{ >> + struct acpi_table_header *table; >> + int status = acpi_get_table(signature, instance, &table); >> + >> + if (ACPI_FAILURE(status)) >> + return ERR_PTR(-ENOENT); >> + return table; >> +} >> +DEFINE_FREE(acpi_table, struct acpi_table_header *, if (!IS_ERR(_T)) acpi_put_table(_T)) > > Jonathan commented on this already - worth getting Rafael's opinion, > I am fine either way. > > I have not found anything that should block this code (other than code > that I know I will have to rework when GICv5 ACPI support gets in) so: I've factored that out. > Reviewed-by: Lorenzo Pieralisi <lpieralisi@kernel.org> From your comments I assume you're happy with Jonathan's suggested changes too - which I've picked up. Thanks! James
On Wed, 10 Sep 2025 20:42:46 +0000 James Morse <james.morse@arm.com> wrote: > Add code to parse the arm64 specific MPAM table, looking up the cache > level from the PPTT and feeding the end result into the MPAM driver. > > For now the MPAM hook mpam_ris_create() is stubbed out, but will update > the MPAM driver with optional discovered data. > > CC: Carl Worth <carl@os.amperecomputing.com> > Link: https://developer.arm.com/documentation/den0065/3-0bet/?lang=en > Signed-off-by: James Morse <james.morse@arm.com> > Hi James, A few comments inline. Note I've more or less completely forgotten what was discussed in RFC 1 so I might well be repeating stuff that you replied to then. Always a problem for me with big complex patch sets! Jonathan > diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c > new file mode 100644 > index 000000000000..fd9cfa143676 > --- /dev/null > +++ b/drivers/acpi/arm64/mpam.c > +static int acpi_mpam_parse_resource(struct mpam_msc *msc, > + struct acpi_mpam_resource_node *res) > +{ > + int level, nid; > + u32 cache_id; > + > + switch (res->locator_type) { > + case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: > + cache_id = res->locator.cache_locator.cache_reference; > + level = find_acpi_cache_level_from_id(cache_id); > + if (level <= 0) { > + pr_err_once("Bad level (%u) for cache with id %u\n", level, cache_id); > + return -EINVAL; > + } > + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, > + level, cache_id); > + case ACPI_MPAM_LOCATION_TYPE_MEMORY: > + nid = pxm_to_node(res->locator.memory_locator.proximity_domain); > + if (nid == NUMA_NO_NODE) > + nid = 0; > + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, > + 255, nid); > + default: > + /* These get discovered later and treated as unknown */ are treated? > + return 0; > + } > +} > +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, > + struct platform_device *pdev, > + u32 *acpi_id) > +{ > + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1]; > + bool acpi_id_valid = false; > + struct acpi_device *buddy; > + char uid[11]; > + int err; > + > + memset(&hid, 0, sizeof(hid)); = {}; above and skip the memset. > + memcpy(hid, &tbl_msc->hardware_id_linked_device, > + sizeof(tbl_msc->hardware_id_linked_device)); > + > + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { > + *acpi_id = tbl_msc->instance_id_linked_device; > + acpi_id_valid = true; > + } > + > + err = snprintf(uid, sizeof(uid), "%u", > + tbl_msc->instance_id_linked_device); > + if (err >= sizeof(uid)) { > + pr_debug("Failed to convert uid of device for power management."); > + return acpi_id_valid; > + } > + > + buddy = acpi_dev_get_first_match_dev(hid, uid, -1); > + if (buddy) > + device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS); > + > + return acpi_id_valid; > +} > + > +static int __init acpi_mpam_parse(void) > +{ > + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); > + char *table_end, *table_offset = (char *)(table + 1); > + struct property_entry props[4]; /* needs a sentinel */ Perhaps move this and res into the loop and use = {}; > + struct acpi_mpam_msc_node *tbl_msc; > + int next_res, next_prop, err = 0; > + struct acpi_device *companion; > + struct platform_device *pdev; > + enum mpam_msc_iface iface; > + struct resource res[3]; Add a comment here or a check later on why this is large enough. > + char uid[16]; > + u32 acpi_id; > + > + if (acpi_disabled || !system_supports_mpam() || IS_ERR(table)) > + return 0; > + > + if (table->revision < 1) > + return 0; > + > + table_end = (char *)table + table->length; > + > + while (table_offset < table_end) { > + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; > + table_offset += tbl_msc->length; > + > + if (table_offset > table_end) { > + pr_debug("MSC entry overlaps end of ACPI table\n"); > + break; That this isn't considered an error is a bit subtle and made me wonder if there was a use of uninitialized pdev (there isn't because err == 0) Why not return here? > + } > + > + /* > + * If any of the reserved fields are set, make no attempt to > + * parse the MSC structure. This MSC will still be counted, > + * meaning the MPAM driver can't probe against all MSC, and > + * will never be enabled. There is no way to enable it safely, > + * because we cannot determine safe system-wide partid and pmg > + * ranges in this situation. > + */ This is decidedly paranoid. I'd normally expect the architecture to be based on assumption that is fine for old software to ignore new fields. ACPI itself has fairly firm rules on this (though it goes wrong sometimes :) I'm guessing there is something out there that made this necessary though so keep it if you actually need it. > + if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) { > + pr_err_once("Unrecognised MSC, MPAM not usable\n"); > + pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier); > + continue; > + } > + > + if (!tbl_msc->mmio_size) { > + pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier); > + continue; > + } > + > + if (decode_interface_type(tbl_msc, &iface)) { > + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); > + continue; > + } > + > + next_res = 0; > + next_prop = 0; > + memset(res, 0, sizeof(res)); > + memset(props, 0, sizeof(props)); > + > + pdev = platform_device_alloc("mpam_msc", tbl_msc->identifier); https://lore.kernel.org/all/20241009124120.1124-13-shiju.jose@huawei.com/ was a proposal to add a DEFINE_FREE() to clean these up. Might be worth a revisit. Then Greg was against the use it was put to and asking for an example of where it helped. Maybe this is that example. If you do want to do that, I'd factor out a bunch of the stuff here as a helper so we can have the clean ownership pass of a return_ptr(). Similar to what Shiju did here (this is the usecase for platform device that Greg didn't like). https://lore.kernel.org/all/20241009124120.1124-14-shiju.jose@huawei.com/ Even without that I think factoring some of this out and hence being able to do returns on errors and put the if (err) into the loop would be a nice improvement to readability. > + if (!pdev) { > + err = -ENOMEM; > + break; > + } > + > + if (tbl_msc->length < sizeof(*tbl_msc)) { > + err = -EINVAL; > + break; > + } > + > + /* Some power management is described in the namespace: */ > + err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier); > + if (err > 0 && err < sizeof(uid)) { > + companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1); > + if (companion) > + ACPI_COMPANION_SET(&pdev->dev, companion); > + else > + pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier); > + } > + > + if (iface == MPAM_IFACE_MMIO) { > + res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address, > + tbl_msc->mmio_size, > + "MPAM:MSC"); > + } else if (iface == MPAM_IFACE_PCC) { > + props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel", > + tbl_msc->base_address); > + next_prop++; Why the double increment? Needs a comment if that is right thing to do. > + } > + > + acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res); > + err = platform_device_add_resources(pdev, res, next_res); > + if (err) > + break; > + > + props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us", > + tbl_msc->max_nrdy_usec); > + > + /* > + * The MSC's CPU affinity is described via its linked power > + * management device, but only if it points at a Processor or > + * Processor Container. > + */ > + if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) { > + props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", > + acpi_id); > + } > + > + err = device_create_managed_software_node(&pdev->dev, props, > + NULL); > + if (err) > + break; > + > + /* Come back later if you want the RIS too */ > + err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length); > + if (err) > + break; > + > + err = platform_device_add(pdev); > + if (err) > + break; > + } > + > + if (err) > + platform_device_put(pdev); > + > + return err; > +} > + > +int acpi_mpam_count_msc(void) > +{ > + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); > + char *table_end, *table_offset = (char *)(table + 1); > + struct acpi_mpam_msc_node *tbl_msc; > + int count = 0; > + > + if (IS_ERR(table)) > + return 0; > + > + if (table->revision < 1) > + return 0; > + > + table_end = (char *)table + table->length; Trivial so feel free to ignore. Perhaps should aim for consistency. Whilst I prefer pointers for this stuff PPTT did use unsigned longs. > + > + while (table_offset < table_end) { > + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; > + if (!tbl_msc->mmio_size) > + continue; > + > + if (tbl_msc->length < sizeof(*tbl_msc)) > + return -EINVAL; > + if (tbl_msc->length > table_end - table_offset) > + return -EINVAL; > + table_offset += tbl_msc->length; > + > + count++; > + } > + > + return count; > +} > + Could reorder to put acpi_mpam_parse and this use of it together? > +/* > + * Call after ACPI devices have been created, which happens behind acpi_scan_init() > + * called from subsys_initcall(). PCC requires the mailbox driver, which is > + * initialised from postcore_initcall(). > + */ > +subsys_initcall_sync(acpi_mpam_parse); > diff --git a/include/linux/acpi.h b/include/linux/acpi.h > index c5fd92cda487..af449964426b 100644 > --- a/include/linux/acpi.h > +++ b/include/linux/acpi.h > @@ -8,6 +8,7 @@ > #ifndef _LINUX_ACPI_H > #define _LINUX_ACPI_H > > +#include <linux/cleanup.h> > #include <linux/errno.h> > #include <linux/ioport.h> /* for struct resource */ > #include <linux/resource_ext.h> > @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); > void acpi_table_init_complete (void); > int acpi_table_init (void); > > +static inline struct acpi_table_header *acpi_get_table_ret(char *signature, u32 instance) > +{ > + struct acpi_table_header *table; > + int status = acpi_get_table(signature, instance, &table); > + > + if (ACPI_FAILURE(status)) > + return ERR_PTR(-ENOENT); > + return table; > +} > +DEFINE_FREE(acpi_table, struct acpi_table_header *, if (!IS_ERR(_T)) acpi_put_table(_T)) I'd use if (!IS_ERR_OR_NULL(_T)) not because it is functionally necessary but because it will let the compiler optimize this out if it can tell that in a given path _T is NULL (I think it was Peter Z who pointed this out in a similar interface a while back). I'd like an opinion from Rafael on this in general. > + > int acpi_table_parse(char *id, acpi_tbl_table_handler handler); > int __init_or_acpilib acpi_table_parse_entries(char *id, > unsigned long table_size, int entry_id,
Hi Jonathan, On 11/09/2025 14:17, Jonathan Cameron wrote: > On Wed, 10 Sep 2025 20:42:46 +0000 > James Morse <james.morse@arm.com> wrote: > >> Add code to parse the arm64 specific MPAM table, looking up the cache >> level from the PPTT and feeding the end result into the MPAM driver. >> >> For now the MPAM hook mpam_ris_create() is stubbed out, but will update >> the MPAM driver with optional discovered data. > A few comments inline. Note I've more or less completely forgotten > what was discussed in RFC 1 so I might well be repeating stuff that > you replied to then. Always a problem for me with big complex patch sets! No worries - I'll have forgotten too. If we can't work it out, it should have had a comment. >> diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c >> new file mode 100644 >> index 000000000000..fd9cfa143676 >> --- /dev/null >> +++ b/drivers/acpi/arm64/mpam.c > > >> +static int acpi_mpam_parse_resource(struct mpam_msc *msc, >> + struct acpi_mpam_resource_node *res) >> +{ >> + int level, nid; >> + u32 cache_id; >> + >> + switch (res->locator_type) { >> + case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: >> + cache_id = res->locator.cache_locator.cache_reference; >> + level = find_acpi_cache_level_from_id(cache_id); >> + if (level <= 0) { >> + pr_err_once("Bad level (%u) for cache with id %u\n", level, cache_id); >> + return -EINVAL; >> + } >> + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, >> + level, cache_id); >> + case ACPI_MPAM_LOCATION_TYPE_MEMORY: >> + nid = pxm_to_node(res->locator.memory_locator.proximity_domain); >> + if (nid == NUMA_NO_NODE) >> + nid = 0; >> + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, >> + 255, nid); >> + default: >> + /* These get discovered later and treated as unknown */ > > are treated? Sure, >> + return 0; >> + } >> +} > >> +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, >> + struct platform_device *pdev, >> + u32 *acpi_id) >> +{ >> + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1]; >> + bool acpi_id_valid = false; >> + struct acpi_device *buddy; >> + char uid[11]; >> + int err; >> + >> + memset(&hid, 0, sizeof(hid)); > > = {}; above and skip the memset. Sure, >> + memcpy(hid, &tbl_msc->hardware_id_linked_device, >> + sizeof(tbl_msc->hardware_id_linked_device)); >> + >> + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { >> + *acpi_id = tbl_msc->instance_id_linked_device; >> + acpi_id_valid = true; >> + } >> + >> + err = snprintf(uid, sizeof(uid), "%u", >> + tbl_msc->instance_id_linked_device); >> + if (err >= sizeof(uid)) { >> + pr_debug("Failed to convert uid of device for power management."); >> + return acpi_id_valid; >> + } >> + >> + buddy = acpi_dev_get_first_match_dev(hid, uid, -1); >> + if (buddy) >> + device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS); >> + >> + return acpi_id_valid; >> +} > >> + >> +static int __init acpi_mpam_parse(void) >> +{ >> + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); >> + char *table_end, *table_offset = (char *)(table + 1); >> + struct property_entry props[4]; /* needs a sentinel */ > Perhaps move this and res into the loop and use = {}; >> + struct acpi_mpam_msc_node *tbl_msc; >> + int next_res, next_prop, err = 0; >> + struct acpi_device *companion; >> + struct platform_device *pdev; >> + enum mpam_msc_iface iface; >> + struct resource res[3]; > > Add a comment here or a check later on why this is large enough. These two are now in the loop and look like this: | /* pcc, nrdy, affinity and a sentinel */ | struct property_entry props[4] = { 0 }; | /* mmio, 2xirq, no sentinel. */ | struct resource res[3] = { 0 }; >> + char uid[16]; >> + u32 acpi_id; >> + >> + if (acpi_disabled || !system_supports_mpam() || IS_ERR(table)) >> + return 0; >> + >> + if (table->revision < 1) >> + return 0; >> + >> + table_end = (char *)table + table->length; >> + >> + while (table_offset < table_end) { >> + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; >> + table_offset += tbl_msc->length; >> + >> + if (table_offset > table_end) { >> + pr_debug("MSC entry overlaps end of ACPI table\n"); >> + break; > That this isn't considered an error is a bit subtle and made me wonder > if there was a use of uninitialized pdev (there isn't because err == 0) Its somewhat a philosophical arguement. I don't expect the kernel to have to validate these tables, they're not provided by the user and there quickly becomes a point where you have to trust them, and they have to be correct. At the other extreme is the asusmption the table is line-noise and we should check everything to avoid out of bounds accesses. Dave wanted the diagnostic messages on these. As this is called from an initcall, the best you get is an inexplicable print message. (what should we say - update your firmware?) Silently failing in this code is always safe as the driver has a count of the number of MSC, and doesn't start accessing the hardware until its found them all. (this is because to find the system wide minimum value - and its not worth starting if its not possible to finish). > Why not return here? Just because there was no other return in the loop, and I hate surprise returns. I'll change it if it avoids thinking about how that platform_device_put() gets skipped! > >> + } >> + >> + /* >> + * If any of the reserved fields are set, make no attempt to >> + * parse the MSC structure. This MSC will still be counted, >> + * meaning the MPAM driver can't probe against all MSC, and >> + * will never be enabled. There is no way to enable it safely, >> + * because we cannot determine safe system-wide partid and pmg >> + * ranges in this situation. >> + */ > This is decidedly paranoid. I'd normally expect the architecture to be based > on assumption that is fine for old software to ignore new fields. ACPI itself > has fairly firm rules on this (though it goes wrong sometimes :) Yeah - the MPAM table isn't properly structured as subtables. I don't see how they are going to extend it if they need to. The paranoia is that anything set in these reserved fields probably indicates something the driver needs to know about: a case in point is the way PCC was added. I'd much prefer we skip creation of MSC devices that have properties we don't understand. acpi_mpam_count_msc() still counts them - which means the driver never finds all the MSC, and never touches the hardware. MPAM isn't a critical feature, its better that it be disabled than make things worse. (the same attitude holds with the response to the MPAM error interrupt - reset everything and pack up shop. This is bettern than accidentally combining important/unimportant tasks) > I'm guessing there is something out there that made this necessary though so > keep it if you actually need it. It's a paranoid/violent reaction to the way PCC was added - without something like this, that would have led to the OS trying to map the 0 page and poking around in it - never likely to go well. Doing this does let them pull another PCC without stable kernels going wrong. Ultimately I think they'll need to replace the table with one that is properly structured. For now - this is working with what we have. >> + if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) { >> + pr_err_once("Unrecognised MSC, MPAM not usable\n"); >> + pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier); >> + continue; >> + } >> + >> + if (!tbl_msc->mmio_size) { >> + pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier); >> + continue; >> + } >> + >> + if (decode_interface_type(tbl_msc, &iface)) { >> + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); >> + continue; >> + } >> + >> + next_res = 0; >> + next_prop = 0; >> + memset(res, 0, sizeof(res)); >> + memset(props, 0, sizeof(props)); >> + >> + pdev = platform_device_alloc("mpam_msc", tbl_msc->identifier); > > https://lore.kernel.org/all/20241009124120.1124-13-shiju.jose@huawei.com/ > was a proposal to add a DEFINE_FREE() to clean these up. Might be worth a revisit. > Then Greg was against the use it was put to and asking for an example of where > it helped. Maybe this is that example. > > If you do want to do that, I'd factor out a bunch of the stuff here as a helper > so we can have the clean ownership pass of a return_ptr(). > Similar to what Shiju did here (this is the usecase for platform device that > Greg didn't like). > https://lore.kernel.org/all/20241009124120.1124-14-shiju.jose@huawei.com/ > > Even without that I think factoring some of this out and hence being able to > do returns on errors and put the if (err) into the loop would be a nice > improvement to readability. If you think its more readable I'll structure it like that. >> + if (!pdev) { >> + err = -ENOMEM; >> + break; >> + } >> + >> + if (tbl_msc->length < sizeof(*tbl_msc)) { >> + err = -EINVAL; >> + break; >> + } >> + >> + /* Some power management is described in the namespace: */ >> + err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier); >> + if (err > 0 && err < sizeof(uid)) { >> + companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1); >> + if (companion) >> + ACPI_COMPANION_SET(&pdev->dev, companion); >> + else >> + pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier); >> + } >> + >> + if (iface == MPAM_IFACE_MMIO) { >> + res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address, >> + tbl_msc->mmio_size, >> + "MPAM:MSC"); >> + } else if (iface == MPAM_IFACE_PCC) { >> + props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel", >> + tbl_msc->base_address); >> + next_prop++; > > Why the double increment? Needs a comment if that is right thing to do. That's a bug. I'll add some WARN_ON() when these are consumed as per your earlier suggestion. >> + } >> + >> + acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res); >> + err = platform_device_add_resources(pdev, res, next_res); >> + if (err) >> + break; >> + >> + props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us", >> + tbl_msc->max_nrdy_usec); >> + >> + /* >> + * The MSC's CPU affinity is described via its linked power >> + * management device, but only if it points at a Processor or >> + * Processor Container. >> + */ >> + if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) { >> + props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", >> + acpi_id); >> + } >> + >> + err = device_create_managed_software_node(&pdev->dev, props, >> + NULL); >> + if (err) >> + break; >> + >> + /* Come back later if you want the RIS too */ >> + err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length); >> + if (err) >> + break; >> + >> + err = platform_device_add(pdev); >> + if (err) >> + break; >> + } >> + >> + if (err) >> + platform_device_put(pdev); >> + >> + return err; >> +} >> + >> +int acpi_mpam_count_msc(void) >> +{ >> + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); >> + char *table_end, *table_offset = (char *)(table + 1); >> + struct acpi_mpam_msc_node *tbl_msc; >> + int count = 0; >> + >> + if (IS_ERR(table)) >> + return 0; >> + >> + if (table->revision < 1) >> + return 0; >> + >> + table_end = (char *)table + table->length; > Trivial so feel free to ignore. > Perhaps should aim for consistency. Whilst I prefer pointers for this stuff > PPTT did use unsigned longs. I prefer the pointers, and as it's a separate file, I don't think it needs to be concistent with PPTT. >> + >> + while (table_offset < table_end) { >> + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; >> + if (!tbl_msc->mmio_size) >> + continue; >> + >> + if (tbl_msc->length < sizeof(*tbl_msc)) >> + return -EINVAL; >> + if (tbl_msc->length > table_end - table_offset) >> + return -EINVAL; >> + table_offset += tbl_msc->length; >> + >> + count++; >> + } >> + >> + return count; >> +} >> + > Could reorder to put acpi_mpam_parse and this use of it together? mpam_msc_driver_init() calls this from subsys_initcall() to know whether its worth registering the driver at all. Even with that fixed, its still potentially racy: once the first MSC has been platform_device_add()ed, I figure the driver can probe against that, and needs to know if this first MSC was the last one. acpi_mpam_count_msc() needs to be safe to race with acpi_mpam_parse(). This could be forced far enough away in time by only registering the driver after subsys_initcall_sync() has completed - but the list of dependencies on those is ugly enough as it is. I'll add a comment: /** * acpi_mpam_count_msc() - Count the number of MSC described by firmware. * * Returns the number of of MSC, or zero for an error. * * This can be called before or in parallel with acpi_mpam_parse(). */ >> +/* >> + * Call after ACPI devices have been created, which happens behind acpi_scan_init() >> + * called from subsys_initcall(). PCC requires the mailbox driver, which is >> + * initialised from postcore_initcall(). >> + */ >> +subsys_initcall_sync(acpi_mpam_parse); >> diff --git a/include/linux/acpi.h b/include/linux/acpi.h >> index c5fd92cda487..af449964426b 100644 >> --- a/include/linux/acpi.h >> +++ b/include/linux/acpi.h >> @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); >> void acpi_table_init_complete (void); >> int acpi_table_init (void); >> >> +static inline struct acpi_table_header *acpi_get_table_ret(char *signature, u32 instance) >> +{ >> + struct acpi_table_header *table; >> + int status = acpi_get_table(signature, instance, &table); >> + >> + if (ACPI_FAILURE(status)) >> + return ERR_PTR(-ENOENT); >> + return table; >> +} >> +DEFINE_FREE(acpi_table, struct acpi_table_header *, if (!IS_ERR(_T)) acpi_put_table(_T)) > I'd use if (!IS_ERR_OR_NULL(_T)) not because it is functionally necessary but > because it will let the compiler optimize this out if it can tell that in a given > path _T is NULL (I think it was Peter Z who pointed this out in a similar interface > a while back). Makes sense, > I'd like an opinion from Rafael on this in general. Thanks, James
Hi James Just a few things I've picked out to reply to. Absolutely fine with your other replies. > > >> + char uid[16]; > >> + u32 acpi_id; > >> + > >> + if (acpi_disabled || !system_supports_mpam() || IS_ERR(table)) > >> + return 0; > >> + > >> + if (table->revision < 1) > >> + return 0; > >> + > >> + table_end = (char *)table + table->length; > >> + > >> + while (table_offset < table_end) { > >> + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; > >> + table_offset += tbl_msc->length; > >> + > >> + if (table_offset > table_end) { > >> + pr_debug("MSC entry overlaps end of ACPI table\n"); > >> + break; > > > That this isn't considered an error is a bit subtle and made me wonder > > if there was a use of uninitialized pdev (there isn't because err == 0) > > Its somewhat a philosophical arguement. I don't expect the kernel to have to validate > these tables, they're not provided by the user and there quickly becomes a point where > you have to trust them, and they have to be correct. Potential buffer overrun is to me always worth error out screaming, but I get your broader point. Maybe just make it a pr_err() > At the other extreme is the asusmption the table is line-noise and we should check > everything to avoid out of bounds accesses. Dave wanted the diagnostic messages on these. > > As this is called from an initcall, the best you get is an inexplicable print message. > (what should we say - update your firmware?) Depends on whether you can lean hard on the firmware team. Much easier for me if I can tell them the board doesn't boot because they got it wrong. That would have been safer if we had this upstream in advance of hardware, but indeed a little high risk today as who knows what borked tables are out there. Personal preference though is to error out on things like this and handle the papering over at the top level. Don't put extra effort into checking tables are invalid but if we happen to notice as part of code safety stuff like sizes then good to scream about it. > > > Silently failing in this code is always safe as the driver has a count of the number of > MSC, and doesn't start accessing the hardware until its found them all. > (this is because to find the system wide minimum value - and its not worth starting if > its not possible to finish). > > > > Why not return here? > > Just because there was no other return in the loop, and I hate surprise returns. > > I'll change it if it avoids thinking about how that platform_device_put() gets skipped! > > > > > >> + } > >> + > >> + /* > >> + * If any of the reserved fields are set, make no attempt to > >> + * parse the MSC structure. This MSC will still be counted, > >> + * meaning the MPAM driver can't probe against all MSC, and > >> + * will never be enabled. There is no way to enable it safely, > >> + * because we cannot determine safe system-wide partid and pmg > >> + * ranges in this situation. > >> + */ > > > This is decidedly paranoid. I'd normally expect the architecture to be based > > on assumption that is fine for old software to ignore new fields. ACPI itself > > has fairly firm rules on this (though it goes wrong sometimes :) > > Yeah - the MPAM table isn't properly structured as subtables. I don't see how they are > going to extend it if they need to. > > The paranoia is that anything set in these reserved fields probably indicates something > the driver needs to know about: a case in point is the way PCC was added. > > I'd much prefer we skip creation of MSC devices that have properties we don't understand. > acpi_mpam_count_msc() still counts them - which means the driver never finds all the MSC, > and never touches the hardware. > > MPAM isn't a critical feature, its better that it be disabled than make things worse. > (the same attitude holds with the response to the MPAM error interrupt - reset everything > and pack up shop. This is bettern than accidentally combining important/unimportant > tasks) > > > > I'm guessing there is something out there that made this necessary though so > > keep it if you actually need it. > > It's a paranoid/violent reaction to the way PCC was added - without something like this, > that would have led to the OS trying to map the 0 page and poking around in it - never > likely to go well. > > Doing this does let them pull another PCC without stable kernels going wrong. > Ultimately I think they'll need to replace the table with one that is properly structured. > For now - this is working with what we have. Fair enough. I'm too lazy / behind with reviews to go scream via our channels about problems here. Paranoia it is. Maybe we'll end up backporting some 'fixes' that ignore nicely added fields with appropriate control bits to turn them on. So be it if that happens. > > > >> + if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) { > >> + pr_err_once("Unrecognised MSC, MPAM not usable\n"); > >> + pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier); > >> + continue; > >> + } > >> + > >> + if (!tbl_msc->mmio_size) { > >> + pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier); > >> + continue; > >> + } > >> + > >> + if (decode_interface_type(tbl_msc, &iface)) { > >> + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); > >> + continue; > >> + } > >> + > >> + next_res = 0; > >> + next_prop = 0; > >> + memset(res, 0, sizeof(res)); > >> + memset(props, 0, sizeof(props)); > >> + > >> + pdev = platform_device_alloc("mpam_msc", tbl_msc->identifier); > > > > https://lore.kernel.org/all/20241009124120.1124-13-shiju.jose@huawei.com/ > > was a proposal to add a DEFINE_FREE() to clean these up. Might be worth a revisit. > > Then Greg was against the use it was put to and asking for an example of where > > it helped. Maybe this is that example. > > > > If you do want to do that, I'd factor out a bunch of the stuff here as a helper > > so we can have the clean ownership pass of a return_ptr(). > > Similar to what Shiju did here (this is the usecase for platform device that > > Greg didn't like). > > https://lore.kernel.org/all/20241009124120.1124-14-shiju.jose@huawei.com/ > > > > Even without that I think factoring some of this out and hence being able to > > do returns on errors and put the if (err) into the loop would be a nice > > improvement to readability. > > If you think its more readable I'll structure it like that. The refactor yes. I'd keep clear of the the DEFINE_FREE() unless you have some spare time ;) > > > >> + if (!pdev) { > >> + err = -ENOMEM; > >> + break; > >> + } > >> +int acpi_mpam_count_msc(void) > >> +{ > >> + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); > >> + char *table_end, *table_offset = (char *)(table + 1); > >> + struct acpi_mpam_msc_node *tbl_msc; > >> + int count = 0; > >> + > >> + if (IS_ERR(table)) > >> + return 0; > >> + > >> + if (table->revision < 1) > >> + return 0; > >> + > >> + table_end = (char *)table + table->length; > > > Trivial so feel free to ignore. > > Perhaps should aim for consistency. Whilst I prefer pointers for this stuff > > PPTT did use unsigned longs. > > I prefer the pointers, and as it's a separate file, I don't think it needs to be > concistent with PPTT. Fair enough. Maybe PPTT is ripe for some cleanup once you are done messing with it. I'm certainly going to add churn now. J
After updating the monitor configuration, the first read of the monitoring
result requires waiting for the "not ready" duration before an effective
value can be obtained.
Because a component consists of multiple MPAM instances, after updating the
configuration of each instance, should wait for the "not ready" period of
per single instance before the valid monitoring value can be obtained, not
just wait for once interval per component.
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
---
It's fine to merge this patch directly into patch 7 of the responding
patchset.
---
drivers/resctrl/mpam_devices.c | 36 +++++++++++++++-------------------
1 file changed, 16 insertions(+), 20 deletions(-)
diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index 2962cd018207..e79a46646863 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -1183,11 +1183,14 @@ static void __ris_msmon_read(void *arg)
}
*m->val += now;
+ m->err = 0;
}
static int _msmon_read(struct mpam_component *comp, struct mon_read *arg)
{
int err, idx;
+ bool read_again;
+ u64 wait_jiffies;
struct mpam_msc *msc;
struct mpam_vmsc *vmsc;
struct mpam_msc_ris *ris;
@@ -1198,10 +1201,22 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg)
list_for_each_entry_rcu(ris, &vmsc->ris, vmsc_list) {
arg->ris = ris;
+ read_again = false;
+again:
err = smp_call_function_any(&msc->accessibility,
__ris_msmon_read, arg,
true);
+ if (arg->err == -EBUSY && !read_again) {
+ read_again = true;
+
+ wait_jiffies = usecs_to_jiffies(comp->class->nrdy_usec);
+ while (wait_jiffies)
+ wait_jiffies = schedule_timeout_uninterruptible(wait_jiffies);
+
+ goto again;
+ }
+
if (!err && arg->err)
err = arg->err;
if (err)
@@ -1218,9 +1233,7 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg)
int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx,
enum mpam_device_features type, u64 *val)
{
- int err;
struct mon_read arg;
- u64 wait_jiffies = 0;
struct mpam_props *cprops = &comp->class->props;
might_sleep();
@@ -1237,24 +1250,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx,
arg.val = val;
*val = 0;
- err = _msmon_read(comp, &arg);
- if (err == -EBUSY && comp->class->nrdy_usec)
- wait_jiffies = usecs_to_jiffies(comp->class->nrdy_usec);
-
- while (wait_jiffies)
- wait_jiffies = schedule_timeout_uninterruptible(wait_jiffies);
-
- if (err == -EBUSY) {
- memset(&arg, 0, sizeof(arg));
- arg.ctx = ctx;
- arg.type = type;
- arg.val = val;
- *val = 0;
-
- err = _msmon_read(comp, &arg);
- }
-
- return err;
+ return _msmon_read(comp, &arg);
}
void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx)
--
2.25.1
Hi Zeng, On 16/09/2025 14:17, Zeng Heng wrote: > After updating the monitor configuration, the first read of the monitoring > result requires waiting for the "not ready" duration before an effective > value can be obtained. May need to wait - some platforms need to do this, some don't. Yours is the first I've heard of that does this! > Because a component consists of multiple MPAM instances, after updating the > configuration of each instance, should wait for the "not ready" period of > per single instance before the valid monitoring value can be obtained, not > just wait for once interval per component. I'm really trying to avoid that ... if you have ~200 MSC pretending to be one thing, you'd wait 200x the maximum period. On systems with CMN, the number of MSC scales with the number of CPUs, so 200x isn't totally crazy. I think the real problem here is the driver doesn't go on to reconfigure MSC-2 if MSC-1 returned not-ready, meaning the "I'll only wait once" logic kicks in and returns not-ready to the user. (which is presumably what you're seeing?) Does this solve your problem?: -----------------%<----------------- diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 404bd4c1fd5e..2f39d0339349 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1395,7 +1395,7 @@ static void __ris_msmon_read(void *arg) static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) { - int err, idx; + int err, any_err = 0, idx; struct mpam_msc *msc; struct mpam_vmsc *vmsc; struct mpam_msc_ris *ris; @@ -1412,15 +1412,19 @@ static int _msmon_read(struct mpam_component *comp, stru ct mon_read *arg) true); if (!err && arg->err) err = arg->err; + + /* + * Save one error to be returned to the caller, but + * keep reading counters so that the get reprogrammed. + * On platforms with NRDY this lets us wait once. + */ if (err) - break; + any_err = err; } - if (err) - break; } srcu_read_unlock(&mpam_srcu, idx); - return err; + return any_err; } int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, -----------------%<----------------- Thanks, James
© 2016 - 2025 Red Hat, Inc.