[v1] arm_mpam: Add basic mpam driver

[PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by James Morse 5 months, 2 weeks ago

Probing MPAM is convoluted. MSCs that are integrated with a CPU may
only be accessible from those CPUs, and they may not be online.
Touching the hardware early is pointless as MPAM can't be used until
the system-wide common values for num_partid and num_pmg have been
discovered.

Start with driver probe/remove and mapping the MSC.

CC: Carl Worth <carl@os.amperecomputing.com>
Signed-off-by: James Morse <james.morse@arm.com>
---
Changes since RFC:
 * Check for status=broken DT devices.
 * Moved all the files around.
 * Made Kconfig symbols depend on EXPERT
---
 arch/arm64/Kconfig              |   1 +
 drivers/Kconfig                 |   2 +
 drivers/Makefile                |   1 +
 drivers/resctrl/Kconfig         |  11 ++
 drivers/resctrl/Makefile        |   4 +
 drivers/resctrl/mpam_devices.c  | 336 ++++++++++++++++++++++++++++++++
 drivers/resctrl/mpam_internal.h |  62 ++++++
 7 files changed, 417 insertions(+)
 create mode 100644 drivers/resctrl/Kconfig
 create mode 100644 drivers/resctrl/Makefile
 create mode 100644 drivers/resctrl/mpam_devices.c
 create mode 100644 drivers/resctrl/mpam_internal.h

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e51ccf1da102..ea3c54e04275 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2062,6 +2062,7 @@ config ARM64_TLB_RANGE
 
 config ARM64_MPAM
 	bool "Enable support for MPAM"
+	select ARM64_MPAM_DRIVER
 	select ACPI_MPAM if ACPI
 	help
 	  Memory Partitioning and Monitoring is an optional extension
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 4915a63866b0..3054b50a2f4c 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -251,4 +251,6 @@ source "drivers/hte/Kconfig"
 
 source "drivers/cdx/Kconfig"
 
+source "drivers/resctrl/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index b5749cf67044..f41cf4eddeba 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -194,5 +194,6 @@ obj-$(CONFIG_HTE)		+= hte/
 obj-$(CONFIG_DRM_ACCEL)		+= accel/
 obj-$(CONFIG_CDX_BUS)		+= cdx/
 obj-$(CONFIG_DPLL)		+= dpll/
+obj-y				+= resctrl/
 
 obj-$(CONFIG_S390)		+= s390/
diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
new file mode 100644
index 000000000000..dff7b87280ab
--- /dev/null
+++ b/drivers/resctrl/Kconfig
@@ -0,0 +1,11 @@
+# Confusingly, this is everything but the CPU bits of MPAM. CPU here means
+# CPU resources, not containers or cgroups etc.
+config ARM64_MPAM_DRIVER
+	bool "MPAM driver for System IP, e,g. caches and memory controllers"
+	depends on ARM64_MPAM && EXPERT
+
+config ARM64_MPAM_DRIVER_DEBUG
+	bool "Enable debug messages from the MPAM driver."
+	depends on ARM64_MPAM_DRIVER
+	help
+	  Say yes here to enable debug messages from the MPAM driver.
diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
new file mode 100644
index 000000000000..92b48fa20108
--- /dev/null
+++ b/drivers/resctrl/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_ARM64_MPAM_DRIVER)			+= mpam.o
+mpam-y						+= mpam_devices.o
+
+cflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG)	+= -DDEBUG
diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
new file mode 100644
index 000000000000..a0d9a699a6e7
--- /dev/null
+++ b/drivers/resctrl/mpam_devices.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/acpi.h>
+#include <linux/arm_mpam.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+
+#include <acpi/pcc.h>
+
+#include "mpam_internal.h"
+
+/*
+ * mpam_list_lock protects the SRCU lists when writing. Once the
+ * mpam_enabled key is enabled these lists are read-only,
+ * unless the error interrupt disables the driver.
+ */
+static DEFINE_MUTEX(mpam_list_lock);
+static LIST_HEAD(mpam_all_msc);
+
+static struct srcu_struct mpam_srcu;
+
+/* MPAM isn't available until all the MSC have been probed. */
+static u32 mpam_num_msc;
+
+static void mpam_discovery_complete(void)
+{
+	pr_err("Discovered all MSC\n");
+}
+
+static int mpam_dt_count_msc(void)
+{
+	int count = 0;
+	struct device_node *np;
+
+	for_each_compatible_node(np, NULL, "arm,mpam-msc") {
+		if (of_device_is_available(np))
+			count++;
+	}
+
+	return count;
+}
+
+static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
+				  u32 ris_idx)
+{
+	int err = 0;
+	u32 level = 0;
+	unsigned long cache_id;
+	struct device_node *cache;
+
+	do {
+		if (of_device_is_compatible(np, "arm,mpam-cache")) {
+			cache = of_parse_phandle(np, "arm,mpam-device", 0);
+			if (!cache) {
+				pr_err("Failed to read phandle\n");
+				break;
+			}
+		} else if (of_device_is_compatible(np->parent, "cache")) {
+			cache = of_node_get(np->parent);
+		} else {
+			/* For now, only caches are supported */
+			cache = NULL;
+			break;
+		}
+
+		err = of_property_read_u32(cache, "cache-level", &level);
+		if (err) {
+			pr_err("Failed to read cache-level\n");
+			break;
+		}
+
+		cache_id = cache_of_calculate_id(cache);
+		if (cache_id == ~0UL) {
+			err = -ENOENT;
+			break;
+		}
+
+		err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
+				      cache_id);
+	} while (0);
+	of_node_put(cache);
+
+	return err;
+}
+
+static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored)
+{
+	int err, num_ris = 0;
+	const u32 *ris_idx_p;
+	struct device_node *iter, *np;
+
+	np = msc->pdev->dev.of_node;
+	for_each_child_of_node(np, iter) {
+		ris_idx_p = of_get_property(iter, "reg", NULL);
+		if (ris_idx_p) {
+			num_ris++;
+			err = mpam_dt_parse_resource(msc, iter, *ris_idx_p);
+			if (err) {
+				of_node_put(iter);
+				return err;
+			}
+		}
+	}
+
+	if (!num_ris)
+		mpam_dt_parse_resource(msc, np, 0);
+
+	return err;
+}
+
+/*
+ * An MSC can control traffic from a set of CPUs, but may only be accessible
+ * from a (hopefully wider) set of CPUs. The common reason for this is power
+ * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the
+ * the corresponding cache may also be powered off. By making accesses from
+ * one of those CPUs, we ensure this isn't the case.
+ */
+static int update_msc_accessibility(struct mpam_msc *msc)
+{
+	struct device_node *parent;
+	u32 affinity_id;
+	int err;
+
+	if (!acpi_disabled) {
+		err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity",
+					       &affinity_id);
+		if (err)
+			cpumask_copy(&msc->accessibility, cpu_possible_mask);
+		else
+			acpi_pptt_get_cpus_from_container(affinity_id,
+							  &msc->accessibility);
+
+		return 0;
+	}
+
+	/* This depends on the path to of_node */
+	parent = of_get_parent(msc->pdev->dev.of_node);
+	if (parent == of_root) {
+		cpumask_copy(&msc->accessibility, cpu_possible_mask);
+		err = 0;
+	} else {
+		err = -EINVAL;
+		pr_err("Cannot determine accessibility of MSC: %s\n",
+		       dev_name(&msc->pdev->dev));
+	}
+	of_node_put(parent);
+
+	return err;
+}
+
+static int fw_num_msc;
+
+static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg)
+{
+	/* TODO: wake up tasks blocked on this MSC's PCC channel */
+}
+
+static void mpam_msc_drv_remove(struct platform_device *pdev)
+{
+	struct mpam_msc *msc = platform_get_drvdata(pdev);
+
+	if (!msc)
+		return;
+
+	mutex_lock(&mpam_list_lock);
+	mpam_num_msc--;
+	platform_set_drvdata(pdev, NULL);
+	list_del_rcu(&msc->glbl_list);
+	synchronize_srcu(&mpam_srcu);
+	devm_kfree(&pdev->dev, msc);
+	mutex_unlock(&mpam_list_lock);
+}
+
+static int mpam_msc_drv_probe(struct platform_device *pdev)
+{
+	int err;
+	struct mpam_msc *msc;
+	struct resource *msc_res;
+	void *plat_data = pdev->dev.platform_data;
+
+	mutex_lock(&mpam_list_lock);
+	do {
+		msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
+		if (!msc) {
+			err = -ENOMEM;
+			break;
+		}
+
+		mutex_init(&msc->probe_lock);
+		mutex_init(&msc->part_sel_lock);
+		mutex_init(&msc->outer_mon_sel_lock);
+		raw_spin_lock_init(&msc->inner_mon_sel_lock);
+		msc->id = mpam_num_msc++;
+		msc->pdev = pdev;
+		INIT_LIST_HEAD_RCU(&msc->glbl_list);
+		INIT_LIST_HEAD_RCU(&msc->ris);
+
+		err = update_msc_accessibility(msc);
+		if (err)
+			break;
+		if (cpumask_empty(&msc->accessibility)) {
+			pr_err_once("msc:%u is not accessible from any CPU!",
+				    msc->id);
+			err = -EINVAL;
+			break;
+		}
+
+		if (device_property_read_u32(&pdev->dev, "pcc-channel",
+					     &msc->pcc_subspace_id))
+			msc->iface = MPAM_IFACE_MMIO;
+		else
+			msc->iface = MPAM_IFACE_PCC;
+
+		if (msc->iface == MPAM_IFACE_MMIO) {
+			void __iomem *io;
+
+			io = devm_platform_get_and_ioremap_resource(pdev, 0,
+								    &msc_res);
+			if (IS_ERR(io)) {
+				pr_err("Failed to map MSC base address\n");
+				err = PTR_ERR(io);
+				break;
+			}
+			msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
+			msc->mapped_hwpage = io;
+		} else if (msc->iface == MPAM_IFACE_PCC) {
+			msc->pcc_cl.dev = &pdev->dev;
+			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
+			msc->pcc_cl.tx_block = false;
+			msc->pcc_cl.tx_tout = 1000; /* 1s */
+			msc->pcc_cl.knows_txdone = false;
+
+			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
+								 msc->pcc_subspace_id);
+			if (IS_ERR(msc->pcc_chan)) {
+				pr_err("Failed to request MSC PCC channel\n");
+				err = PTR_ERR(msc->pcc_chan);
+				break;
+			}
+		}
+
+		list_add_rcu(&msc->glbl_list, &mpam_all_msc);
+		platform_set_drvdata(pdev, msc);
+	} while (0);
+	mutex_unlock(&mpam_list_lock);
+
+	if (!err) {
+		/* Create RIS entries described by firmware */
+		if (!acpi_disabled)
+			err = acpi_mpam_parse_resources(msc, plat_data);
+		else
+			err = mpam_dt_parse_resources(msc, plat_data);
+	}
+
+	if (!err && fw_num_msc == mpam_num_msc)
+		mpam_discovery_complete();
+
+	if (err && msc)
+		mpam_msc_drv_remove(pdev);
+
+	return err;
+}
+
+static const struct of_device_id mpam_of_match[] = {
+	{ .compatible = "arm,mpam-msc", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, mpam_of_match);
+
+static struct platform_driver mpam_msc_driver = {
+	.driver = {
+		.name = "mpam_msc",
+		.of_match_table = of_match_ptr(mpam_of_match),
+	},
+	.probe = mpam_msc_drv_probe,
+	.remove = mpam_msc_drv_remove,
+};
+
+/*
+ * MSC that are hidden under caches are not created as platform devices
+ * as there is no cache driver. Caches are also special-cased in
+ * update_msc_accessibility().
+ */
+static void mpam_dt_create_foundling_msc(void)
+{
+	int err;
+	struct device_node *cache;
+
+	for_each_compatible_node(cache, NULL, "cache") {
+		err = of_platform_populate(cache, mpam_of_match, NULL, NULL);
+		if (err)
+			pr_err("Failed to create MSC devices under caches\n");
+	}
+}
+
+static int __init mpam_msc_driver_init(void)
+{
+	if (!system_supports_mpam())
+		return -EOPNOTSUPP;
+
+	init_srcu_struct(&mpam_srcu);
+
+	if (!acpi_disabled)
+		fw_num_msc = acpi_mpam_count_msc();
+	else
+		fw_num_msc = mpam_dt_count_msc();
+
+	if (fw_num_msc <= 0) {
+		pr_err("No MSC devices found in firmware\n");
+		return -EINVAL;
+	}
+
+	if (acpi_disabled)
+		mpam_dt_create_foundling_msc();
+
+	return platform_driver_register(&mpam_msc_driver);
+}
+subsys_initcall(mpam_msc_driver_init);
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
new file mode 100644
index 000000000000..07e0f240eaca
--- /dev/null
+++ b/drivers/resctrl/mpam_internal.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (C) 2024 Arm Ltd.
+
+#ifndef MPAM_INTERNAL_H
+#define MPAM_INTERNAL_H
+
+#include <linux/arm_mpam.h>
+#include <linux/cpumask.h>
+#include <linux/io.h>
+#include <linux/mailbox_client.h>
+#include <linux/mutex.h>
+#include <linux/resctrl.h>
+#include <linux/sizes.h>
+
+struct mpam_msc {
+	/* member of mpam_all_msc */
+	struct list_head        glbl_list;
+
+	int			id;
+	struct platform_device *pdev;
+
+	/* Not modified after mpam_is_enabled() becomes true */
+	enum mpam_msc_iface	iface;
+	u32			pcc_subspace_id;
+	struct mbox_client	pcc_cl;
+	struct pcc_mbox_chan	*pcc_chan;
+	u32			nrdy_usec;
+	cpumask_t		accessibility;
+
+	/*
+	 * probe_lock is only take during discovery. After discovery these
+	 * properties become read-only and the lists are protected by SRCU.
+	 */
+	struct mutex		probe_lock;
+	unsigned long		ris_idxs[128 / BITS_PER_LONG];
+	u32			ris_max;
+
+	/* mpam_msc_ris of this component */
+	struct list_head	ris;
+
+	/*
+	 * part_sel_lock protects access to the MSC hardware registers that are
+	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
+	 * by RIS).
+	 * If needed, take msc->lock first.
+	 */
+	struct mutex		part_sel_lock;
+
+	/*
+	 * mon_sel_lock protects access to the MSC hardware registers that are
+	 * affeted by MPAMCFG_MON_SEL.
+	 * If needed, take msc->lock first.
+	 */
+	struct mutex		outer_mon_sel_lock;
+	raw_spinlock_t		inner_mon_sel_lock;
+	unsigned long		inner_mon_sel_flags;
+
+	void __iomem		*mapped_hwpage;
+	size_t			mapped_hwpage_sz;
+};
+
+#endif /* MPAM_INTERNAL_H */
-- 
2.20.1

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Dave Martin 5 months, 1 week ago

Hi James,

On Fri, Aug 22, 2025 at 03:29:51PM +0000, James Morse wrote:
> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
> only be accessible from those CPUs, and they may not be online.
> Touching the hardware early is pointless as MPAM can't be used until
> the system-wide common values for num_partid and num_pmg have been
> discovered.
> 
> Start with driver probe/remove and mapping the MSC.
> 
> CC: Carl Worth <carl@os.amperecomputing.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> Changes since RFC:
>  * Check for status=broken DT devices.
>  * Moved all the files around.
>  * Made Kconfig symbols depend on EXPERT
> ---

[...]

> diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
> new file mode 100644
> index 000000000000..dff7b87280ab
> --- /dev/null
> +++ b/drivers/resctrl/Kconfig
> @@ -0,0 +1,11 @@
> +# Confusingly, this is everything but the CPU bits of MPAM. CPU here means
> +# CPU resources, not containers or cgroups etc.

Drop confusing comment?

CPUs are not mentioned other than in the comment -- I think the
descriptions are sufficiently self-explanatory that they don't read
onto CPUs.

> +config ARM64_MPAM_DRIVER
> +	bool "MPAM driver for System IP, e,g. caches and memory controllers"
> +	depends on ARM64_MPAM && EXPERT
> +
> +config ARM64_MPAM_DRIVER_DEBUG
> +	bool "Enable debug messages from the MPAM driver."

Nit: spurious full stop.

(i.e., people don't add one in these one-line descriptions.
They are title-like and self-delimiting, even when the text is a valid
sentence.)

> +	depends on ARM64_MPAM_DRIVER
> +	help
> +	  Say yes here to enable debug messages from the MPAM driver.
> diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
> new file mode 100644
> index 000000000000..92b48fa20108
> --- /dev/null
> +++ b/drivers/resctrl/Makefile
> @@ -0,0 +1,4 @@
> +obj-$(CONFIG_ARM64_MPAM_DRIVER)			+= mpam.o
> +mpam-y						+= mpam_devices.o
> +
> +cflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG)	+= -DDEBUG
> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
> new file mode 100644
> index 000000000000..a0d9a699a6e7
> --- /dev/null
> +++ b/drivers/resctrl/mpam_devices.c
> @@ -0,0 +1,336 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (C) 2025 Arm Ltd.
> +
> +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
> +
> +#include <linux/acpi.h>
> +#include <linux/arm_mpam.h>
> +#include <linux/cacheinfo.h>
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/gfp.h>
> +#include <linux/list.h>
> +#include <linux/lockdep.h>
> +#include <linux/mutex.h>
> +#include <linux/of.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/printk.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/srcu.h>
> +#include <linux/types.h>
> +
> +#include <acpi/pcc.h>
> +
> +#include "mpam_internal.h"
> +
> +/*
> + * mpam_list_lock protects the SRCU lists when writing. Once the
> + * mpam_enabled key is enabled these lists are read-only,
> + * unless the error interrupt disables the driver.
> + */
> +static DEFINE_MUTEX(mpam_list_lock);
> +static LIST_HEAD(mpam_all_msc);
> +
> +static struct srcu_struct mpam_srcu;
> +
> +/* MPAM isn't available until all the MSC have been probed. */

Comment doesn't really explain the variable.

Maybe something like "Number of MSCs that need to be probed for MPAM
to be usable" ?

> +static u32 mpam_num_msc;

Any particular reason this is u32 and not unsigned int?

How are accesses to this protected against data races?

If there are supposed to be locks to protect globals in the MPAM driver,
is it worth wrapping them in access functions with a lockdep assert?
Otherwise, it feels rather easy to get this wrong -- I think I've found
at least one bug (see mpam_msc_drv_probe().)

> +
> +static void mpam_discovery_complete(void)
> +{
> +	pr_err("Discovered all MSC\n");
> +}

As others have commented, if this is non-functional code that gets
removed later on, it's probably best to drop this up-front?


[...]

> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
> +				  u32 ris_idx)
> +{
> +	int err = 0;
> +	u32 level = 0;
> +	unsigned long cache_id;
> +	struct device_node *cache;
> +
> +	do {
> +		if (of_device_is_compatible(np, "arm,mpam-cache")) {
> +			cache = of_parse_phandle(np, "arm,mpam-device", 0);
> +			if (!cache) {
> +				pr_err("Failed to read phandle\n");
> +				break;
> +			}
> +		} else if (of_device_is_compatible(np->parent, "cache")) {
> +			cache = of_node_get(np->parent);
> +		} else {
> +			/* For now, only caches are supported */
> +			cache = NULL;
> +			break;
> +		}
> +
> +		err = of_property_read_u32(cache, "cache-level", &level);
> +		if (err) {
> +			pr_err("Failed to read cache-level\n");
> +			break;
> +		}
> +
> +		cache_id = cache_of_calculate_id(cache);
> +		if (cache_id == ~0UL) {

The type of cache_id may change if the return type of
cache_of_calculate_id() changes (see comments on patch 1).

Possible #define for the exceptional value.

> +			err = -ENOENT;
> +			break;

The lack of a diagnostic here is inconsistent with the level of
diagnostics in the rest of the loop.

> +		}
> +
> +		err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
> +				      cache_id);
> +	} while (0);

Abuse of do ... while () here?

There is no loop.  The breaks are stealth "goto"s to this statement:

> +	of_node_put(cache);

(It works either way, but maybe gotos to an explicit label would be
more readable, as well as avoiding an unnecessary level of indentation.)

> +
> +	return err;
> +}

[...]

> +/*
> + * An MSC can control traffic from a set of CPUs, but may only be accessible
> + * from a (hopefully wider) set of CPUs. The common reason for this is power
> + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the
> + * the corresponding cache may also be powered off. By making accesses from

Nit: the the

> + * one of those CPUs, we ensure this isn't the case.
> + */
> +static int update_msc_accessibility(struct mpam_msc *msc)
> +{
> +	struct device_node *parent;
> +	u32 affinity_id;
> +	int err;
> +
> +	if (!acpi_disabled) {
> +		err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity",
> +					       &affinity_id);
> +		if (err)
> +			cpumask_copy(&msc->accessibility, cpu_possible_mask);
> +		else
> +			acpi_pptt_get_cpus_from_container(affinity_id,
> +							  &msc->accessibility);
> +
> +		return 0;
> +	}
> +
> +	/* This depends on the path to of_node */
> +	parent = of_get_parent(msc->pdev->dev.of_node);
> +	if (parent == of_root) {
> +		cpumask_copy(&msc->accessibility, cpu_possible_mask);
> +		err = 0;
> +	} else {
> +		err = -EINVAL;
> +		pr_err("Cannot determine accessibility of MSC: %s\n",
> +		       dev_name(&msc->pdev->dev));
> +	}
> +	of_node_put(parent);
> +
> +	return err;
> +}
> +
> +static int fw_num_msc;

Does this need to be protected against data races?

If individual mpam_msc_drv_probe() calls may execute on different CPUs
from mpam_msc_driver_init(), then seem to be potential races here.

> +
> +static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg)
> +{
> +	/* TODO: wake up tasks blocked on this MSC's PCC channel */

So, is this broken in this commit?

(If the series does not get broken up or applied piecemail, that's not
such a concern, though.)

> +}
> +
> +static void mpam_msc_drv_remove(struct platform_device *pdev)
> +{

The MPAM driver cannot currently be built as a module.

Is it possible to exercise the driver remove paths, today?

> +	struct mpam_msc *msc = platform_get_drvdata(pdev);
> +
> +	if (!msc)
> +		return;
> +
> +	mutex_lock(&mpam_list_lock);
> +	mpam_num_msc--;
> +	platform_set_drvdata(pdev, NULL);
> +	list_del_rcu(&msc->glbl_list);
> +	synchronize_srcu(&mpam_srcu);
> +	devm_kfree(&pdev->dev, msc);
> +	mutex_unlock(&mpam_list_lock);
> +}
> +
> +static int mpam_msc_drv_probe(struct platform_device *pdev)
> +{
> +	int err;
> +	struct mpam_msc *msc;
> +	struct resource *msc_res;
> +	void *plat_data = pdev->dev.platform_data;
> +
> +	mutex_lock(&mpam_list_lock);
> +	do {
> +		msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
> +		if (!msc) {
> +			err = -ENOMEM;
> +			break;
> +		}
> +
> +		mutex_init(&msc->probe_lock);
> +		mutex_init(&msc->part_sel_lock);
> +		mutex_init(&msc->outer_mon_sel_lock);
> +		raw_spin_lock_init(&msc->inner_mon_sel_lock);
> +		msc->id = mpam_num_msc++;
> +		msc->pdev = pdev;
> +		INIT_LIST_HEAD_RCU(&msc->glbl_list);
> +		INIT_LIST_HEAD_RCU(&msc->ris);
> +
> +		err = update_msc_accessibility(msc);
> +		if (err)
> +			break;
> +		if (cpumask_empty(&msc->accessibility)) {
> +			pr_err_once("msc:%u is not accessible from any CPU!",
> +				    msc->id);
> +			err = -EINVAL;
> +			break;
> +		}
> +
> +		if (device_property_read_u32(&pdev->dev, "pcc-channel",
> +					     &msc->pcc_subspace_id))
> +			msc->iface = MPAM_IFACE_MMIO;
> +		else
> +			msc->iface = MPAM_IFACE_PCC;
> +
> +		if (msc->iface == MPAM_IFACE_MMIO) {
> +			void __iomem *io;
> +
> +			io = devm_platform_get_and_ioremap_resource(pdev, 0,
> +								    &msc_res);
> +			if (IS_ERR(io)) {
> +				pr_err("Failed to map MSC base address\n");
> +				err = PTR_ERR(io);
> +				break;
> +			}
> +			msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
> +			msc->mapped_hwpage = io;
> +		} else if (msc->iface == MPAM_IFACE_PCC) {
> +			msc->pcc_cl.dev = &pdev->dev;
> +			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
> +			msc->pcc_cl.tx_block = false;
> +			msc->pcc_cl.tx_tout = 1000; /* 1s */
> +			msc->pcc_cl.knows_txdone = false;
> +
> +			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
> +								 msc->pcc_subspace_id);
> +			if (IS_ERR(msc->pcc_chan)) {
> +				pr_err("Failed to request MSC PCC channel\n");
> +				err = PTR_ERR(msc->pcc_chan);
> +				break;
> +			}
> +		}

Should the lock be held across initialisation of the msc fields?

list_add_rcu() might imply sufficient barriers to ensure that the
initialisations are visible to other threads that obtain the msc
pointer by iterating over mpam_all_msc.

It's probably cleaner to hold the lock explicitly, though.

What other ways of obtaining the msc pointer exist?


> +
> +		list_add_rcu(&msc->glbl_list, &mpam_all_msc);
> +		platform_set_drvdata(pdev, msc);
> +	} while (0);
> +	mutex_unlock(&mpam_list_lock);
> +
> +	if (!err) {
> +		/* Create RIS entries described by firmware */
> +		if (!acpi_disabled)
> +			err = acpi_mpam_parse_resources(msc, plat_data);
> +		else
> +			err = mpam_dt_parse_resources(msc, plat_data);
> +	}
> +
> +	if (!err && fw_num_msc == mpam_num_msc)

Unlocked read of mpam_num_msc?

> +		mpam_discovery_complete();
> +
> +	if (err && msc)
> +		mpam_msc_drv_remove(pdev);
> +
> +	return err;
> +}
> +
> +static const struct of_device_id mpam_of_match[] = {
> +	{ .compatible = "arm,mpam-msc", },
> +	{},
> +};
> +MODULE_DEVICE_TABLE(of, mpam_of_match);
> +
> +static struct platform_driver mpam_msc_driver = {
> +	.driver = {
> +		.name = "mpam_msc",
> +		.of_match_table = of_match_ptr(mpam_of_match),
> +	},
> +	.probe = mpam_msc_drv_probe,
> +	.remove = mpam_msc_drv_remove,
> +};
> +
> +/*
> + * MSC that are hidden under caches are not created as platform devices
> + * as there is no cache driver. Caches are also special-cased in
> + * update_msc_accessibility().
> + */

Can you elaborate?  I don't understand quite what this is doing.

> +static void mpam_dt_create_foundling_msc(void)
> +{
> +	int err;
> +	struct device_node *cache;
> +
> +	for_each_compatible_node(cache, NULL, "cache") {
> +		err = of_platform_populate(cache, mpam_of_match, NULL, NULL);
> +		if (err)
> +			pr_err("Failed to create MSC devices under caches\n");
> +	}
> +}

[...]

> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> new file mode 100644
> index 000000000000..07e0f240eaca
> --- /dev/null
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -0,0 +1,62 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +// Copyright (C) 2024 Arm Ltd.
> +
> +#ifndef MPAM_INTERNAL_H
> +#define MPAM_INTERNAL_H
> +
> +#include <linux/arm_mpam.h>
> +#include <linux/cpumask.h>
> +#include <linux/io.h>
> +#include <linux/mailbox_client.h>
> +#include <linux/mutex.h>
> +#include <linux/resctrl.h>
> +#include <linux/sizes.h>
> +
> +struct mpam_msc {
> +	/* member of mpam_all_msc */
> +	struct list_head        glbl_list;

It is worth making these names less mismatched?

> +
> +	int			id;
> +	struct platform_device *pdev;
> +
> +	/* Not modified after mpam_is_enabled() becomes true */
> +	enum mpam_msc_iface	iface;
> +	u32			pcc_subspace_id;
> +	struct mbox_client	pcc_cl;
> +	struct pcc_mbox_chan	*pcc_chan;
> +	u32			nrdy_usec;
> +	cpumask_t		accessibility;
> +
> +	/*
> +	 * probe_lock is only take during discovery. After discovery these
> +	 * properties become read-only and the lists are protected by SRCU.
> +	 */
> +	struct mutex		probe_lock;

Can we have more clarify about the locking strategy, including details
of which things each lock is supposed to apply to and when, and how (if
at all) the locks are intended to nest?

(Similarly for the global locks.)

> +	unsigned long		ris_idxs[128 / BITS_PER_LONG];
> +	u32			ris_max;

nrdy_usec, ris_idxs and ris_max appear unused in this patch (though I
suppose they get initialised by virtue of kzalloc()).  Is this
intentional?

> +
> +	/* mpam_msc_ris of this component */
> +	struct list_head	ris;
> +
> +	/*
> +	 * part_sel_lock protects access to the MSC hardware registers that are
> +	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
> +	 * by RIS).
> +	 * If needed, take msc->lock first.
> +	 */

What's msc->lock ?

> +	struct mutex		part_sel_lock;
> +
> +	/*
> +	 * mon_sel_lock protects access to the MSC hardware registers that are
> +	 * affeted by MPAMCFG_MON_SEL.
> +	 * If needed, take msc->lock first.
> +	 */

Same here.

> +	struct mutex		outer_mon_sel_lock;
> +	raw_spinlock_t		inner_mon_sel_lock;
> +	unsigned long		inner_mon_sel_flags;
> +
> +	void __iomem		*mapped_hwpage;
> +	size_t			mapped_hwpage_sz;
> +};
> +
> +#endif /* MPAM_INTERNAL_H */

[...]

Cheers
---Dave

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by James Morse 5 months ago

Hi Dave,

On 01/09/2025 12:21, Dave Martin wrote:
> On Fri, Aug 22, 2025 at 03:29:51PM +0000, James Morse wrote:
>> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
>> only be accessible from those CPUs, and they may not be online.
>> Touching the hardware early is pointless as MPAM can't be used until
>> the system-wide common values for num_partid and num_pmg have been
>> discovered.
>>
>> Start with driver probe/remove and mapping the MSC.
>> diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
>> new file mode 100644
>> index 000000000000..dff7b87280ab
>> --- /dev/null
>> +++ b/drivers/resctrl/Kconfig
>> @@ -0,0 +1,11 @@
>> +# Confusingly, this is everything but the CPU bits of MPAM. CPU here means
>> +# CPU resources, not containers or cgroups etc.
>
> Drop confusing comment?
>
> CPUs are not mentioned other than in the comment -- I think the
> descriptions are sufficiently self-explanatory that they don't read
> onto CPUs.

This used to add ARM_CPU_RESCTRL, to mirror X86_CPU_RESCTRL.
It's been tidied up since then, but the comment remains.
I'll remove it.



>> +config ARM64_MPAM_DRIVER
>> +	bool "MPAM driver for System IP, e,g. caches and memory controllers"
>> +	depends on ARM64_MPAM && EXPERT
>> +
>> +config ARM64_MPAM_DRIVER_DEBUG
>> +	bool "Enable debug messages from the MPAM driver."
>
> Nit: spurious full stop.
>
> (i.e., people don't add one in these one-line descriptions.
> They are title-like and self-delimiting, even when the text is a valid
> sentence.)

/me waves hands around


>> +	depends on ARM64_MPAM_DRIVER
>> +	help
>> +	  Say yes here to enable debug messages from the MPAM driver.

>> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
>> new file mode 100644
>> index 000000000000..a0d9a699a6e7
>> --- /dev/null
>> +++ b/drivers/resctrl/mpam_devices.c
>> @@ -0,0 +1,336 @@

>> +/*
>> + * mpam_list_lock protects the SRCU lists when writing. Once the
>> + * mpam_enabled key is enabled these lists are read-only,
>> + * unless the error interrupt disables the driver.
>> + */
>> +static DEFINE_MUTEX(mpam_list_lock);
>> +static LIST_HEAD(mpam_all_msc);
>> +
>> +static struct srcu_struct mpam_srcu;
>> +
>> +/* MPAM isn't available until all the MSC have been probed. */
>
> Comment doesn't really explain the variable.
>
> Maybe something like "Number of MSCs that need to be probed for MPAM
> to be usable" ?

Its the count not the remainder. I went with:
| * Number of MSCs that have been probed. Once all MSC have been probed MPAM
| * can be enabled.


>> +static u32 mpam_num_msc;
>
> Any particular reason this is u32 and not unsigned int?
u32 is less typing!


> How are accesses to this protected against data races?

It's under the list-lock, but after Rob's feedback I've made it an atomic_t and stopped
using it as an id for all the print messages.


> If there are supposed to be locks to protect globals in the MPAM driver,
> is it worth wrapping them in access functions with a lockdep assert?
> Otherwise, it feels rather easy to get this wrong -- I think I've found
> at least one bug (see mpam_msc_drv_probe().)
Broadly: everything is protected by the list_lock when things are being discovered.
Once everything has been discovered, these things can become read-only.

It's not until everything has been discovered that interrupts get registered, and things
like a potential PMU driver could make calls in a strange context.

Adding helpers would need some global state variable, if (state == foo) lockdep_assert...
I had that early on, but figured it was overkill.


>> +static void mpam_discovery_complete(void)
>> +{
>> +	pr_err("Discovered all MSC\n");
>> +}

> As others have commented, if this is non-functional code that gets
> removed later on, it's probably best to drop this up-front?
It's illustrating that something happens after all the MSC have been discovered.
Knowing that from the beginning of the series is supposed to make the insertion of the
cpuhp notifiers in the middle easier to think about...


>> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
>> +				  u32 ris_idx)
>> +{
>> +	int err = 0;
>> +	u32 level = 0;
>> +	unsigned long cache_id;
>> +	struct device_node *cache;
>> +
>> +	do {
>> +		if (of_device_is_compatible(np, "arm,mpam-cache")) {
>> +			cache = of_parse_phandle(np, "arm,mpam-device", 0);
>> +			if (!cache) {
>> +				pr_err("Failed to read phandle\n");
>> +				break;
>> +			}
>> +		} else if (of_device_is_compatible(np->parent, "cache")) {
>> +			cache = of_node_get(np->parent);
>> +		} else {
>> +			/* For now, only caches are supported */
>> +			cache = NULL;
>> +			break;
>> +		}
>> +
>> +		err = of_property_read_u32(cache, "cache-level", &level);
>> +		if (err) {
>> +			pr_err("Failed to read cache-level\n");
>> +			break;
>> +		}
>> +
>> +		cache_id = cache_of_calculate_id(cache);
>> +		if (cache_id == ~0UL) {
>
> The type of cache_id may change if the return type of
> cache_of_calculate_id() changes (see comments on patch 1).

Yup,

> Possible #define for the exceptional value.

I don't think its any more surprising than '-1' as an error value, and its only got one
caller, which is pretty obviously an error path.


>> +			err = -ENOENT;
>> +			break;
>
> The lack of a diagnostic here is inconsistent with the level of
> diagnostics in the rest of the loop.

I've never needed to debug that one because its already visible to user-space. If the
cache-id's are missing, you can tell that in sysfs, you don't need to instrument the kernel.

I'll add one if you think its important. They can all be _once, and as its related to
the probing of a particular device, can use the dev_ print helpers.


>> +		}
>> +
>> +		err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
>> +				      cache_id);
>> +	} while (0);
>
> Abuse of do ... while () here?
>
> There is no loop.  The breaks are stealth "goto"s to this statement:
Yes. The alternative would be actual gotos - which is surely worse!
It just wasn't worth pulling this out as a separate function.


>> +	of_node_put(cache);
>
> (It works either way, but maybe gotos to an explicit label would be
> more readable, as well as avoiding an unnecessary level of indentation.)
As the cleanup magic has become fashionable, I'll switch to using that...


>> +
>> +	return err;
>> +}
>
> [...]
>
>> +/*
>> + * An MSC can control traffic from a set of CPUs, but may only be accessible
>> + * from a (hopefully wider) set of CPUs. The common reason for this is power
>> + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the
>> + * the corresponding cache may also be powered off. By making accesses from
>
> Nit: the the

Fixed,


>> + * one of those CPUs, we ensure this isn't the case.
>> + */

>> +static int fw_num_msc;
>
> Does this need to be protected against data races?
>
> If individual mpam_msc_drv_probe() calls may execute on different CPUs
> from mpam_msc_driver_init(), then seem to be potential races here.

Incrementing was under the list-lock, but not the last 'are they all done' read. Following
Rob's comments I've made this an atomic_t.


>> +
>> +static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg)
>> +{
>> +	/* TODO: wake up tasks blocked on this MSC's PCC channel */
>
> So, is this broken in this commit?>
> (If the series does not get broken up or applied piecemail, that's not
> such a concern, though.)
Unsupported - or at least only enough to not mistake them for MMIO devices.
I've pulled this out to a later patch in the tree that isn't in this series.

The platforms that need this haven't yet materialised, (and may not!).
There is a prototype for DT/SCMI, but nothing I've seen yet for ACPI.


>> +}
>> +
>> +static void mpam_msc_drv_remove(struct platform_device *pdev)
>> +{
>
> The MPAM driver cannot currently be built as a module.
>
> Is it possible to exercise the driver remove paths, today?
Yes, through the sysfs unbind interface.

It doesn't make a lot of sense for MPAM as the moment you unbind the driver from one MSC
it has to work out if it needs to teardown resctrl...


>> +	struct mpam_msc *msc = platform_get_drvdata(pdev);
>> +
>> +	if (!msc)
>> +		return;
>> +
>> +	mutex_lock(&mpam_list_lock);
>> +	mpam_num_msc--;
>> +	platform_set_drvdata(pdev, NULL);
>> +	list_del_rcu(&msc->glbl_list);
>> +	synchronize_srcu(&mpam_srcu);
>> +	devm_kfree(&pdev->dev, msc);
>> +	mutex_unlock(&mpam_list_lock);
>> +}
>> +
>> +static int mpam_msc_drv_probe(struct platform_device *pdev)
>> +{
>> +	int err;
>> +	struct mpam_msc *msc;
>> +	struct resource *msc_res;
>> +	void *plat_data = pdev->dev.platform_data;
>> +
>> +	mutex_lock(&mpam_list_lock);
>> +	do {
>> +		msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
>> +		if (!msc) {
>> +			err = -ENOMEM;
>> +			break;
>> +		}
>> +
>> +		mutex_init(&msc->probe_lock);
>> +		mutex_init(&msc->part_sel_lock);
>> +		mutex_init(&msc->outer_mon_sel_lock);
>> +		raw_spin_lock_init(&msc->inner_mon_sel_lock);
>> +		msc->id = mpam_num_msc++;
>> +		msc->pdev = pdev;
>> +		INIT_LIST_HEAD_RCU(&msc->glbl_list);
>> +		INIT_LIST_HEAD_RCU(&msc->ris);
>> +
>> +		err = update_msc_accessibility(msc);
>> +		if (err)
>> +			break;
>> +		if (cpumask_empty(&msc->accessibility)) {
>> +			pr_err_once("msc:%u is not accessible from any CPU!",
>> +				    msc->id);
>> +			err = -EINVAL;
>> +			break;
>> +		}
>> +
>> +		if (device_property_read_u32(&pdev->dev, "pcc-channel",
>> +					     &msc->pcc_subspace_id))
>> +			msc->iface = MPAM_IFACE_MMIO;
>> +		else
>> +			msc->iface = MPAM_IFACE_PCC;
>> +
>> +		if (msc->iface == MPAM_IFACE_MMIO) {
>> +			void __iomem *io;
>> +
>> +			io = devm_platform_get_and_ioremap_resource(pdev, 0,
>> +								    &msc_res);
>> +			if (IS_ERR(io)) {
>> +				pr_err("Failed to map MSC base address\n");
>> +				err = PTR_ERR(io);
>> +				break;
>> +			}
>> +			msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
>> +			msc->mapped_hwpage = io;
>> +		} else if (msc->iface == MPAM_IFACE_PCC) {
>> +			msc->pcc_cl.dev = &pdev->dev;
>> +			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
>> +			msc->pcc_cl.tx_block = false;
>> +			msc->pcc_cl.tx_tout = 1000; /* 1s */
>> +			msc->pcc_cl.knows_txdone = false;
>> +
>> +			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
>> +								 msc->pcc_subspace_id);
>> +			if (IS_ERR(msc->pcc_chan)) {
>> +				pr_err("Failed to request MSC PCC channel\n");
>> +				err = PTR_ERR(msc->pcc_chan);
>> +				break;
>> +			}
>> +		}

> Should the lock be held across initialisation of the msc fields?

The msc isn't visible until its added to the list, so provided all that inialisation is
done 'before' its added to the list, it doesn't matter.


> list_add_rcu() might imply sufficient barriers to ensure that the
> initialisations are visible to other threads that obtain the msc
> pointer by iterating over mpam_all_msc.
>
> It's probably cleaner to hold the lock explicitly, though.

The list lock? We do.
But the readers don't need to take the list lock, its only there to prevent concurrent
writers.


> What other ways of obtaining the msc pointer exist?
The class/component/device structures in a subsequent patch, protected in the same way.
Once MPAM is enabled all that can be sprayed through resctrl - at which point no
modifications are allowed, and teardown for fatal errors depends on the static-key.


>> +
>> +		list_add_rcu(&msc->glbl_list, &mpam_all_msc);
>> +		platform_set_drvdata(pdev, msc);
>> +	} while (0);
>> +	mutex_unlock(&mpam_list_lock);
>> +
>> +	if (!err) {
>> +		/* Create RIS entries described by firmware */
>> +		if (!acpi_disabled)
>> +			err = acpi_mpam_parse_resources(msc, plat_data);
>> +		else
>> +			err = mpam_dt_parse_resources(msc, plat_data);
>> +	}
>> +
>> +	if (!err && fw_num_msc == mpam_num_msc)

> Unlocked read of mpam_num_msc?
Fixed as an atomic_t flavoured thing.


>> +		mpam_discovery_complete();
>> +
>> +	if (err && msc)
>> +		mpam_msc_drv_remove(pdev);
>> +
>> +	return err;
>> +}
>> +/*
>> + * MSC that are hidden under caches are not created as platform devices
>> + * as there is no cache driver. Caches are also special-cased in
>> + * update_msc_accessibility().
>> + */
>
> Can you elaborate?  I don't understand quite what this is doing.

/ {
    my_thing {
        compatible = "my_thing";
        msc {
           compatible = "arm,mpam-msc";
        };
    };

    other_thing {
        compatible = "other_thing";
    };

    msc {
       compatible = "arm,mpam-msc";
       arm,mpam-device = <&other_thing>;
    };


    l2-cache {
      compatible = "cache";
        msc {
           compatible = "arm,mpam-msc";
        };
    };
};

my_thing and other_thing's MSC will have devices created - but the cache will not, because
it's a cache not a device, and anything below it is ignored.


>> +static void mpam_dt_create_foundling_msc(void)
>> +{
>> +	int err;
>> +	struct device_node *cache;
>> +
>> +	for_each_compatible_node(cache, NULL, "cache") {
>> +		err = of_platform_populate(cache, mpam_of_match, NULL, NULL);
>> +		if (err)
>> +			pr_err("Failed to create MSC devices under caches\n");
>> +	}
>> +}
>
> [...]
>
>> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
>> new file mode 100644
>> index 000000000000..07e0f240eaca
>> --- /dev/null
>> +++ b/drivers/resctrl/mpam_internal.h
>> @@ -0,0 +1,62 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +// Copyright (C) 2024 Arm Ltd.
>> +
>> +#ifndef MPAM_INTERNAL_H
>> +#define MPAM_INTERNAL_H
>> +
>> +#include <linux/arm_mpam.h>
>> +#include <linux/cpumask.h>
>> +#include <linux/io.h>
>> +#include <linux/mailbox_client.h>
>> +#include <linux/mutex.h>
>> +#include <linux/resctrl.h>
>> +#include <linux/sizes.h>
>> +
>> +struct mpam_msc {
>> +	/* member of mpam_all_msc */
>> +	struct list_head        glbl_list;

> It is worth making these names less mismatched?

all_msc_list ?

It's because its global. The pattern otherwise is parent has a list foo, and all the
children have a member 'foo_list'.


>> +
>> +	int			id;
>> +	struct platform_device *pdev;
>> +
>> +	/* Not modified after mpam_is_enabled() becomes true */
>> +	enum mpam_msc_iface	iface;
>> +	u32			pcc_subspace_id;
>> +	struct mbox_client	pcc_cl;
>> +	struct pcc_mbox_chan	*pcc_chan;
>> +	u32			nrdy_usec;
>> +	cpumask_t		accessibility;
>> +
>> +	/*
>> +	 * probe_lock is only take during discovery. After discovery these
>> +	 * properties become read-only and the lists are protected by SRCU.
>> +	 */
>> +	struct mutex		probe_lock;
>
> Can we have more clarify about the locking strategy, including details
> of which things each lock is supposed to apply to and when, and how (if
> at all) the locks are intended to nest?

The comment above is supposed to describe that. A rule of thumb is its all the stuff
'below' it in the struct, there is also a comment for each order. This one is a bit of a
catch all for the values in this struct that can be written.

The purpose of the part_sel_lock and mon_sel_lock arrangement becomes much more obvious in
subsequent patches when the driver starts accessing the registers of those names.


> (Similarly for the global locks.)
>
>> +	unsigned long		ris_idxs[128 / BITS_PER_LONG];
>> +	u32			ris_max;

> nrdy_usec, ris_idxs and ris_max appear unused in this patch (though I
> suppose they get initialised by virtue of kzalloc()).  Is this
> intentional?

To reduce the amount of churn in the series the bulk of the structure is added here, then
the stuff that build more complicated structures, accesses hardware, deals with interrupts
etc.


>> +
>> +	/* mpam_msc_ris of this component */
>> +	struct list_head	ris;
>> +
>> +	/*
>> +	 * part_sel_lock protects access to the MSC hardware registers that are
>> +	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
>> +	 * by RIS).
>> +	 * If needed, take msc->lock first.
>> +	 */

> What's msc->lock ?

The old name for probe_lock, before it was necessary to lock the hardware registers
separately. I've fixed the name.



Thanks,

James

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Dave Martin 5 months ago

On Fri, Sep 05, 2025 at 07:49:37PM +0100, James Morse wrote:
> Hi Dave,
> 
> On 01/09/2025 12:21, Dave Martin wrote:
> > On Fri, Aug 22, 2025 at 03:29:51PM +0000, James Morse wrote:
> >> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
> >> only be accessible from those CPUs, and they may not be online.
> >> Touching the hardware early is pointless as MPAM can't be used until
> >> the system-wide common values for num_partid and num_pmg have been
> >> discovered.
> >>
> >> Start with driver probe/remove and mapping the MSC.
> >> diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
> >> new file mode 100644
> >> index 000000000000..dff7b87280ab
> >> --- /dev/null
> >> +++ b/drivers/resctrl/Kconfig
> >> @@ -0,0 +1,11 @@
> >> +# Confusingly, this is everything but the CPU bits of MPAM. CPU here means
> >> +# CPU resources, not containers or cgroups etc.
> >
> > Drop confusing comment?
> >
> > CPUs are not mentioned other than in the comment -- I think the
> > descriptions are sufficiently self-explanatory that they don't read
> > onto CPUs.
> 
> This used to add ARM_CPU_RESCTRL, to mirror X86_CPU_RESCTRL.
> It's been tidied up since then, but the comment remains.
> I'll remove it.

OK

> >> +config ARM64_MPAM_DRIVER
> >> +	bool "MPAM driver for System IP, e,g. caches and memory controllers"
> >> +	depends on ARM64_MPAM && EXPERT
> >> +
> >> +config ARM64_MPAM_DRIVER_DEBUG
> >> +	bool "Enable debug messages from the MPAM driver."
> >
> > Nit: spurious full stop.
> >
> > (i.e., people don't add one in these one-line descriptions.
> > They are title-like and self-delimiting, even when the text is a valid
> > sentence.)
> 
> /me waves hands around

I did say "Nit" ;)  (This was mainly a "hmm, this doesn't look quite
like the rest" thing.)


> >> +	depends on ARM64_MPAM_DRIVER
> >> +	help
> >> +	  Say yes here to enable debug messages from the MPAM driver.
> 
> >> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
> >> new file mode 100644
> >> index 000000000000..a0d9a699a6e7
> >> --- /dev/null
> >> +++ b/drivers/resctrl/mpam_devices.c
> >> @@ -0,0 +1,336 @@
> 
> >> +/*
> >> + * mpam_list_lock protects the SRCU lists when writing. Once the
> >> + * mpam_enabled key is enabled these lists are read-only,
> >> + * unless the error interrupt disables the driver.
> >> + */
> >> +static DEFINE_MUTEX(mpam_list_lock);
> >> +static LIST_HEAD(mpam_all_msc);
> >> +
> >> +static struct srcu_struct mpam_srcu;
> >> +
> >> +/* MPAM isn't available until all the MSC have been probed. */
> >
> > Comment doesn't really explain the variable.
> >
> > Maybe something like "Number of MSCs that need to be probed for MPAM
> > to be usable" ?
> 
> Its the count not the remainder. I went with:
> | * Number of MSCs that have been probed. Once all MSC have been probed MPAM
> | * can be enabled.

Thanks, that's clearer.

> >> +static u32 mpam_num_msc;
> >
> > Any particular reason this is u32 and not unsigned int?
> u32 is less typing!

But more effort for the reader / reviewer -- it's semantic noise.

(It works either way though, obviosly.  It looks like this code may
have caught fixed-size-itis off the resctrl code, to some degree.)


> > How are accesses to this protected against data races?
> 
> It's under the list-lock, but after Rob's feedback I've made it an atomic_t and stopped
> using it as an id for all the print messages.

Converting to atomic_t reduces the chance of people asking the
question, but doesn't really answer the question.

Since mpam_num_msc shadows the contents of the lists and msc data
structures, it may matter whether the two can be seen out of sync.

Does it definitely not matter?

> > If there are supposed to be locks to protect globals in the MPAM driver,
> > is it worth wrapping them in access functions with a lockdep assert?
> > Otherwise, it feels rather easy to get this wrong -- I think I've found
> > at least one bug (see mpam_msc_drv_probe().)
> Broadly: everything is protected by the list_lock when things are being discovered.
> Once everything has been discovered, these things can become read-only.
> 
> It's not until everything has been discovered that interrupts get registered, and things
> like a potential PMU driver could make calls in a strange context.
> 
> Adding helpers would need some global state variable, if (state == foo) lockdep_assert...
> I had that early on, but figured it was overkill.

I wonder whether it would be worth migrating this, so that the probe-
time variables (which are read-write) can be kept separate from the
run-time system description variables (which are mostly write-once).

This would avoid having to support two different locking scenarios with
a single mechanism.


I have a bit of a concern that there are too many synchronisation
mechanisms in use, with purposes that overlap and are in some cases
not clealy described and not obvious -- at least, not to me.

I don't think the fact that there have been few comments on this area
necessarily indicates that other reviewers have fully understood the
locking.

> >> +static void mpam_discovery_complete(void)
> >> +{
> >> +	pr_err("Discovered all MSC\n");
> >> +}
> 
> > As others have commented, if this is non-functional code that gets
> > removed later on, it's probably best to drop this up-front?
> It's illustrating that something happens after all the MSC have been discovered.
> Knowing that from the beginning of the series is supposed to make the insertion of the
> cpuhp notifiers in the middle easier to think about...

So long as this was not unintentionally left behind when splitting the
series, I guess it's OK to have it here -- as you say, it does motivate
the shape that the code will eventually need to have.

> >> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
> >> +				  u32 ris_idx)
> >> +{
> >> +	int err = 0;
> >> +	u32 level = 0;
> >> +	unsigned long cache_id;
> >> +	struct device_node *cache;
> >> +
> >> +	do {
> >> +		if (of_device_is_compatible(np, "arm,mpam-cache")) {
> >> +			cache = of_parse_phandle(np, "arm,mpam-device", 0);
> >> +			if (!cache) {
> >> +				pr_err("Failed to read phandle\n");
> >> +				break;
> >> +			}
> >> +		} else if (of_device_is_compatible(np->parent, "cache")) {
> >> +			cache = of_node_get(np->parent);
> >> +		} else {
> >> +			/* For now, only caches are supported */
> >> +			cache = NULL;
> >> +			break;
> >> +		}
> >> +
> >> +		err = of_property_read_u32(cache, "cache-level", &level);
> >> +		if (err) {
> >> +			pr_err("Failed to read cache-level\n");
> >> +			break;
> >> +		}
> >> +
> >> +		cache_id = cache_of_calculate_id(cache);
> >> +		if (cache_id == ~0UL) {
> >
> > The type of cache_id may change if the return type of
> > cache_of_calculate_id() changes (see comments on patch 1).
> 
> Yup,
> 
> > Possible #define for the exceptional value.
> 
> I don't think its any more surprising than '-1' as an error value, and its only got one
> caller, which is pretty obviously an error path.

A #define is not essential.  (As I say elsewhere, I don't entirely
trust uses of ~ where the types may be mixed.  But this case look low-
isk for future maintenance.)

> >> +			err = -ENOENT;
> >> +			break;
> >
> > The lack of a diagnostic here is inconsistent with the level of
> > diagnostics in the rest of the loop.
> 
> I've never needed to debug that one because its already visible to user-space. If the
> cache-id's are missing, you can tell that in sysfs, you don't need to instrument the kernel.
> 
> I'll add one if you think its important. They can all be _once, and as its related to
> the probing of a particular device, can use the dev_ print helpers.

No, I guess that's fine.  I don't have a strong feel for which things a
user is likely to have to debug in practice, and I accept that we
shouldn't try to cover absolutely everything.

> >> +		}
> >> +
> >> +		err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
> >> +				      cache_id);
> >> +	} while (0);
> >
> > Abuse of do ... while () here?
> >
> > There is no loop.  The breaks are stealth "goto"s to this statement:
> Yes. The alternative would be actual gotos - which is surely worse!
> It just wasn't worth pulling this out as a separate function.

Again, it's semantic noise.  do ... while is a looping construct.
If you write "do {", the reader expects a loop and has to analyse the
code in order to conclude that there isn't.

Using a loop in order to avoid giving a label to the sequence point at
the closing bracket is a neat trick, but it's not helpful to someone
reading the code.  I can live with it, but as an idiom this seems rare.

Use of gotos for doing cleanup seems to be the most common idiom for
this in the kernel.  It may be inelegant, but it is likely to be readily
understood.  However...

> 
> 
> >> +	of_node_put(cache);
> >
> > (It works either way, but maybe gotos to an explicit label would be
> > more readable, as well as avoiding an unnecessary level of indentation.)
> As the cleanup magic has become fashionable, I'll switch to using that...

...I guess that works, too.

(Not that I much like that clunky bolt-on extension to the language,
but it should do the job and at least avoids arguments about precisely
how or where the cleanup happens.)

> >> +
> >> +	return err;
> >> +}
> >
> > [...]
> >
> >> +/*
> >> + * An MSC can control traffic from a set of CPUs, but may only be accessible
> >> + * from a (hopefully wider) set of CPUs. The common reason for this is power
> >> + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the
> >> + * the corresponding cache may also be powered off. By making accesses from
> >
> > Nit: the the
> 
> Fixed,
> 
> 
> >> + * one of those CPUs, we ensure this isn't the case.
> >> + */
> 
> >> +static int fw_num_msc;
> >
> > Does this need to be protected against data races?
> >
> > If individual mpam_msc_drv_probe() calls may execute on different CPUs
> > from mpam_msc_driver_init(), then seem to be potential races here.
> 
> Incrementing was under the list-lock, but not the last 'are they all done' read. Following
> Rob's comments I've made this an atomic_t.

As with mpam_num_msc, this eliminates data races on fw_num_msc, but
races between this variable and other data structures may remain.

Can you explain what prevents such races, or why they don't
matter?

> 
> 
> >> +
> >> +static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg)
> >> +{
> >> +	/* TODO: wake up tasks blocked on this MSC's PCC channel */
> >
> > So, is this broken in this commit?>
> > (If the series does not get broken up or applied piecemail, that's not
> > such a concern, though.)
> Unsupported - or at least only enough to not mistake them for MMIO devices.
> I've pulled this out to a later patch in the tree that isn't in this series.
> 
> The platforms that need this haven't yet materialised, (and may not!).
> There is a prototype for DT/SCMI, but nothing I've seen yet for ACPI.

OK; I guess deferring the introduction of this until there is more
context for it makes sense.

> >> +}
> >> +
> >> +static void mpam_msc_drv_remove(struct platform_device *pdev)
> >> +{
> >
> > The MPAM driver cannot currently be built as a module.
> >
> > Is it possible to exercise the driver remove paths, today?
> Yes, through the sysfs unbind interface.
> 
> It doesn't make a lot of sense for MPAM as the moment you unbind the driver from one MSC
> it has to work out if it needs to teardown resctrl...

Has to, but doesn't?  Have I missed something?

Are we supposed to put the MSC back into a sane state, for e.g. the
kexec path?

For resctrl, one option would be to stub out the backend -- i.e.,
we don't tell resctrl that the affected resources disappeared, but
attempts to manipulate the affected MSC(s) are stubbed out (similarly
to what happens to an open tty after a hangup).

(Something along these lines may be done somewhere, but I'm not
currently aware of it.)


Is there an outstanding get on the device that prevents us from getting
here until resctrl is shut down?  Thanks to the wisdom and restraint of
systemd I'd expect resctrl to tangled up in some rat's nest of
unremovable mounts by the time we try to shut down, but I hope I'm
being pessimistic.  (Arguably that's not our bug, if so.)

> >> +	struct mpam_msc *msc = platform_get_drvdata(pdev);
> >> +
> >> +	if (!msc)
> >> +		return;
> >> +
> >> +	mutex_lock(&mpam_list_lock);
> >> +	mpam_num_msc--;
> >> +	platform_set_drvdata(pdev, NULL);
> >> +	list_del_rcu(&msc->glbl_list);
> >> +	synchronize_srcu(&mpam_srcu);
> >> +	devm_kfree(&pdev->dev, msc);
> >> +	mutex_unlock(&mpam_list_lock);
> >> +}
> >> +
> >> +static int mpam_msc_drv_probe(struct platform_device *pdev)
> >> +{
> >> +	int err;
> >> +	struct mpam_msc *msc;
> >> +	struct resource *msc_res;
> >> +	void *plat_data = pdev->dev.platform_data;
> >> +
> >> +	mutex_lock(&mpam_list_lock);
> >> +	do {
> >> +		msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
> >> +		if (!msc) {
> >> +			err = -ENOMEM;
> >> +			break;
> >> +		}
> >> +
> >> +		mutex_init(&msc->probe_lock);
> >> +		mutex_init(&msc->part_sel_lock);
> >> +		mutex_init(&msc->outer_mon_sel_lock);
> >> +		raw_spin_lock_init(&msc->inner_mon_sel_lock);
> >> +		msc->id = mpam_num_msc++;
> >> +		msc->pdev = pdev;
> >> +		INIT_LIST_HEAD_RCU(&msc->glbl_list);

msc->glbl_list is not a list head?  Does it need to be initialised at all?
list_add_rcu() will just splat it, by the looks of it.

> >> +		INIT_LIST_HEAD_RCU(&msc->ris);

Maybe INIT_LIST_HEAD_RCU() isn't needed here.  Do we ever access this
list without holding one of the MSC locks?

This list is not used until a cpuhp hook comes in to probe the MSC, and
then mpam_discovery_cpu_online() obtains a pointer via
list_for_each_entry() -- but this is not RCU-protected.  The MSC probe
lock is taken, there...

> >> +
> >> +		err = update_msc_accessibility(msc);
> >> +		if (err)
> >> +			break;
> >> +		if (cpumask_empty(&msc->accessibility)) {
> >> +			pr_err_once("msc:%u is not accessible from any CPU!",
> >> +				    msc->id);
> >> +			err = -EINVAL;
> >> +			break;
> >> +		}
> >> +
> >> +		if (device_property_read_u32(&pdev->dev, "pcc-channel",
> >> +					     &msc->pcc_subspace_id))
> >> +			msc->iface = MPAM_IFACE_MMIO;
> >> +		else
> >> +			msc->iface = MPAM_IFACE_PCC;
> >> +
> >> +		if (msc->iface == MPAM_IFACE_MMIO) {
> >> +			void __iomem *io;
> >> +
> >> +			io = devm_platform_get_and_ioremap_resource(pdev, 0,
> >> +								    &msc_res);
> >> +			if (IS_ERR(io)) {
> >> +				pr_err("Failed to map MSC base address\n");
> >> +				err = PTR_ERR(io);
> >> +				break;
> >> +			}
> >> +			msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
> >> +			msc->mapped_hwpage = io;
> >> +		} else if (msc->iface == MPAM_IFACE_PCC) {
> >> +			msc->pcc_cl.dev = &pdev->dev;
> >> +			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
> >> +			msc->pcc_cl.tx_block = false;
> >> +			msc->pcc_cl.tx_tout = 1000; /* 1s */
> >> +			msc->pcc_cl.knows_txdone = false;

... so, it feels like we may need to hold the probe lock, or ensure
that all iterations over the msc list are RCU-protected (see below for
counterexamples), or both.

> >> +
> >> +			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
> >> +								 msc->pcc_subspace_id);
> >> +			if (IS_ERR(msc->pcc_chan)) {
> >> +				pr_err("Failed to request MSC PCC channel\n");
> >> +				err = PTR_ERR(msc->pcc_chan);
> >> +				break;
> >> +			}
> >> +		}
> 
> > Should the lock be held across initialisation of the msc fields?
> 
> The msc isn't visible until its added to the list, so provided all that inialisation is
> done 'before' its added to the list, it doesn't matter.

Is it possible for other code to get a pointer to the new msc after
this, other than by dereferencing the list?

(The obvious case is the interrupt handlers, but it looks like the msc
pointers used for registering the interrupts are indeed obtained
through an RCU-protected iteration over mpam_all_msc.)

Note, there seem to be non-RCU-protected iterations over mpam_all_msc
in the mpam_discovery_cpu_online() (patch 14) and
mpam_enable_merge_features() paths (patch 18).  The lack of symmetry
between list maintenance and consumption look a little suspect for
those -- is safety ensured in some other way?

> > list_add_rcu() might imply sufficient barriers to ensure that the
> > initialisations are visible to other threads that obtain the msc
> > pointer by iterating over mpam_all_msc.
> >
> > It's probably cleaner to hold the lock explicitly, though.
> 
> The list lock? We do.
> But the readers don't need to take the list lock, its only there to prevent concurrent
> writers.

I meant whatever lock is supposed to protect the fields of the specific
msc.  The list lock is not that lock.

> 
> 
> > What other ways of obtaining the msc pointer exist?
> The class/component/device structures in a subsequent patch, protected in the same way.
> Once MPAM is enabled all that can be sprayed through resctrl - at which point no
> modifications are allowed, and teardown for fatal errors depends on the static-key.

I'll bear this in mind as I review.

> >> +
> >> +		list_add_rcu(&msc->glbl_list, &mpam_all_msc);
> >> +		platform_set_drvdata(pdev, msc);
> >> +	} while (0);
> >> +	mutex_unlock(&mpam_list_lock);
> >> +
> >> +	if (!err) {
> >> +		/* Create RIS entries described by firmware */
> >> +		if (!acpi_disabled)
> >> +			err = acpi_mpam_parse_resources(msc, plat_data);
> >> +		else
> >> +			err = mpam_dt_parse_resources(msc, plat_data);
> >> +	}
> >> +
> >> +	if (!err && fw_num_msc == mpam_num_msc)
> 
> > Unlocked read of mpam_num_msc?
> Fixed as an atomic_t flavoured thing.

Ditto comments above about whether it is a problem that mpam_num_msc
can now be seen out of sync with the list.

> >> +		mpam_discovery_complete();
> >> +
> >> +	if (err && msc)
> >> +		mpam_msc_drv_remove(pdev);
> >> +
> >> +	return err;
> >> +}
> >> +/*
> >> + * MSC that are hidden under caches are not created as platform devices
> >> + * as there is no cache driver. Caches are also special-cased in
> >> + * update_msc_accessibility().
> >> + */
> >
> > Can you elaborate?  I don't understand quite what this is doing.
> 
> / {
>     my_thing {
>         compatible = "my_thing";
>         msc {
>            compatible = "arm,mpam-msc";
>         };
>     };
> 
>     other_thing {
>         compatible = "other_thing";
>     };
> 
>     msc {
>        compatible = "arm,mpam-msc";
>        arm,mpam-device = <&other_thing>;
>     };
> 
> 
>     l2-cache {
>       compatible = "cache";
>         msc {
>            compatible = "arm,mpam-msc";
>         };
>     };
> };
> 
> my_thing and other_thing's MSC will have devices created - but the cache will not, because
> it's a cache not a device, and anything below it is ignored.

OK.  Maybe reword as something like:

--8<--

MSCs that are declared by the firmware as being part of a cache may not
be created automatically as platform devices, since there is no
dedicated cache driver.

Deal with those MSCs here.

-->8--

Maybe add a comment at update_msc_accessibility() that references this
comment, instead of a reader of that function just needing to know that
this comment is here?

> 
> 
> >> +static void mpam_dt_create_foundling_msc(void)
> >> +{
> >> +	int err;
> >> +	struct device_node *cache;
> >> +
> >> +	for_each_compatible_node(cache, NULL, "cache") {
> >> +		err = of_platform_populate(cache, mpam_of_match, NULL, NULL);
> >> +		if (err)
> >> +			pr_err("Failed to create MSC devices under caches\n");
> >> +	}
> >> +}
> >
> > [...]
> >
> >> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> >> new file mode 100644
> >> index 000000000000..07e0f240eaca
> >> --- /dev/null
> >> +++ b/drivers/resctrl/mpam_internal.h
> >> @@ -0,0 +1,62 @@
> >> +/* SPDX-License-Identifier: GPL-2.0 */
> >> +// Copyright (C) 2024 Arm Ltd.
> >> +
> >> +#ifndef MPAM_INTERNAL_H
> >> +#define MPAM_INTERNAL_H
> >> +
> >> +#include <linux/arm_mpam.h>
> >> +#include <linux/cpumask.h>
> >> +#include <linux/io.h>
> >> +#include <linux/mailbox_client.h>
> >> +#include <linux/mutex.h>
> >> +#include <linux/resctrl.h>
> >> +#include <linux/sizes.h>
> >> +
> >> +struct mpam_msc {
> >> +	/* member of mpam_all_msc */
> >> +	struct list_head        glbl_list;
> 
> > It is worth making these names less mismatched?
> 
> all_msc_list ?
> 
> It's because its global. The pattern otherwise is parent has a list foo, and all the
> children have a member 'foo_list'.

"all_msc", then?

I'm not sure that it's essential for the list head's name to have
"list" in it: that's clear from the type and from how it is used.
"all" seems sufficient to imply that this is a list (?)

It doesn't get much more global than "all".

(This is purely cosmetic, of course.)

> 
> 
> >> +
> >> +	int			id;
> >> +	struct platform_device *pdev;
> >> +
> >> +	/* Not modified after mpam_is_enabled() becomes true */
> >> +	enum mpam_msc_iface	iface;
> >> +	u32			pcc_subspace_id;
> >> +	struct mbox_client	pcc_cl;
> >> +	struct pcc_mbox_chan	*pcc_chan;
> >> +	u32			nrdy_usec;
> >> +	cpumask_t		accessibility;
> >> +
> >> +	/*
> >> +	 * probe_lock is only take during discovery. After discovery these
> >> +	 * properties become read-only and the lists are protected by SRCU.
> >> +	 */
> >> +	struct mutex		probe_lock;
> >
> > Can we have more clarify about the locking strategy, including details
> > of which things each lock is supposed to apply to and when, and how (if
> > at all) the locks are intended to nest?
> 
> The comment above is supposed to describe that. A rule of thumb is its all the stuff
> 'below' it in the struct, there is also a comment for each order. This one is a bit of a
> catch all for the values in this struct that can be written.

I get that, but the devil is in the detail.

A lock that is "a bit of a catch-all" would always need to be taken,
if safety is the goal.

I guess we should try to close out the other discussions about locking
before working out whether anything else is needed, here.

> The purpose of the part_sel_lock and mon_sel_lock arrangement becomes much more obvious in
> subsequent patches when the driver starts accessing the registers of those names.

It's fair enough to defer that context until later -- though arguably
these could be added later when they are actually used.  Not really
worth resplitting the series just for that, though.

> 
> 
> > (Similarly for the global locks.)
> >
> >> +	unsigned long		ris_idxs[128 / BITS_PER_LONG];
> >> +	u32			ris_max;
> 
> > nrdy_usec, ris_idxs and ris_max appear unused in this patch (though I
> > suppose they get initialised by virtue of kzalloc()).  Is this
> > intentional?
> 
> To reduce the amount of churn in the series the bulk of the structure is added here, then
> the stuff that build more complicated structures, accesses hardware, deals with interrupts
> etc.

Ack, just checking that something wasn't unintentionally dropped.

> >> +
> >> +	/* mpam_msc_ris of this component */
> >> +	struct list_head	ris;
> >> +
> >> +	/*
> >> +	 * part_sel_lock protects access to the MSC hardware registers that are
> >> +	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
> >> +	 * by RIS).
> >> +	 * If needed, take msc->lock first.
> >> +	 */
> 
> > What's msc->lock ?
> 
> The old name for probe_lock, before it was necessary to lock the hardware registers
> separately. I've fixed the name.
> 
> Thanks,
> 
> James

Ack

Cheers
---Dave
> 
>

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by James Morse 5 months ago

Hi Dave,

On 08/09/2025 16:25, Dave Martin wrote:
> On Fri, Sep 05, 2025 at 07:49:37PM +0100, James Morse wrote:
>> On 01/09/2025 12:21, Dave Martin wrote:
>>> On Fri, Aug 22, 2025 at 03:29:51PM +0000, James Morse wrote:
>>>> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
>>>> only be accessible from those CPUs, and they may not be online.
>>>> Touching the hardware early is pointless as MPAM can't be used until
>>>> the system-wide common values for num_partid and num_pmg have been
>>>> discovered.
>>>>
>>>> Start with driver probe/remove and mapping the MSC.


>>>> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
>>>> new file mode 100644
>>>> index 000000000000..a0d9a699a6e7
>>>> --- /dev/null
>>>> +++ b/drivers/resctrl/mpam_devices.c
>>>> @@ -0,0 +1,336 @@

>>> How are accesses to this protected against data races?
>>
>> It's under the list-lock, but after Rob's feedback I've made it an atomic_t and stopped
>> using it as an id for all the print messages.
> 
> Converting to atomic_t reduces the chance of people asking the
> question, but doesn't really answer the question.
> 
> Since mpam_num_msc shadows the contents of the lists and msc data
> structures, it may matter whether the two can be seen out of sync.
> 
> Does it definitely not matter?

v2 uses the firmware-table id as the id. The only thing mpam_num_msc needs to do is spot
which MSC was last to be probed so that the cpuhp calls can be registered.

The amount of skid that happens on the way there doesn't matter.

[..]

>>>> +static int fw_num_msc;
>>>
>>> Does this need to be protected against data races?
>>>
>>> If individual mpam_msc_drv_probe() calls may execute on different CPUs
>>> from mpam_msc_driver_init(), then seem to be potential races here.
>>
>> Incrementing was under the list-lock, but not the last 'are they all done' read. Following
>> Rob's comments I've made this an atomic_t.
> 
> As with mpam_num_msc, this eliminates data races on fw_num_msc, but
> races between this variable and other data structures may remain.
> 
> Can you explain what prevents such races, or why they don't
> matter?

See mpam_msc_driver_init(). The value is set before the driver is registered. It can't
probe before its registered, so all the readers must happen after the writer.

[..]


>>>> +}
>>>> +
>>>> +static void mpam_msc_drv_remove(struct platform_device *pdev)
>>>> +{
>>>
>>> The MPAM driver cannot currently be built as a module.
>>>
>>> Is it possible to exercise the driver remove paths, today?
>> Yes, through the sysfs unbind interface.
>>
>> It doesn't make a lot of sense for MPAM as the moment you unbind the driver from one MSC
>> it has to work out if it needs to teardown resctrl...
> 
> Has to, but doesn't?  Have I missed something?

Has to, and does:
https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/drivers/resctrl/mpam_devices.c?h=mpam/snapshot%2bextras/v6.17-rc2&id=41c768c1705d3fa0814bb1cb92c83280c872cb3e#n474

It's probably not what the user wants, but it is what they're asking for if they do this.


> Are we supposed to put the MSC back into a sane state, for e.g. the
> kexec path?

kdump on panic() means you couldn't trust it to do this.
The driver reset all the configurations during probing. It will initially run with
whatever configuration was left in partid-0, there is very little we can do about this.


> For resctrl, one option would be to stub out the backend -- i.e.,
> we don't tell resctrl that the affected resources disappeared, but
> attempts to manipulate the affected MSC(s) are stubbed out (similarly
> to what happens to an open tty after a hangup).
> 
> (Something along these lines may be done somewhere, but I'm not
> currently aware of it.)

On teardown the cpuhp callbacks are unregistered, which makes all the CPUs and
resctrl:domains appear offline, and the static key behind mpam_enabled() is disabled,
which makes a bunch of paths return an error.

The result is no more access to the hardware after an error has occured. The aim
is to prevent accidentally muddling important/unimportant tasks due to PARTID truncation.


> Is there an outstanding get on the device that prevents us from getting
> here until resctrl is shut down?  Thanks to the wisdom and restraint of
> systemd I'd expect resctrl to tangled up in some rat's nest of
> unremovable mounts by the time we try to shut down, but I hope I'm
> being pessimistic.  (Arguably that's not our bug, if so.)

The resctrl_exit() path removes the kernfs internals from the mount point. Systemd is able
to keep zombie mount points, but they end up being empty. (to reproduce this, put resctrl
in fstab!)


>>>> +	struct mpam_msc *msc = platform_get_drvdata(pdev);
>>>> +
>>>> +	if (!msc)
>>>> +		return;
>>>> +
>>>> +	mutex_lock(&mpam_list_lock);
>>>> +	mpam_num_msc--;
>>>> +	platform_set_drvdata(pdev, NULL);
>>>> +	list_del_rcu(&msc->glbl_list);
>>>> +	synchronize_srcu(&mpam_srcu);
>>>> +	devm_kfree(&pdev->dev, msc);
>>>> +	mutex_unlock(&mpam_list_lock);
>>>> +}
>>>> +
>>>> +static int mpam_msc_drv_probe(struct platform_device *pdev)
>>>> +{
>>>> +	int err;
>>>> +	struct mpam_msc *msc;
>>>> +	struct resource *msc_res;
>>>> +	void *plat_data = pdev->dev.platform_data;
>>>> +
>>>> +	mutex_lock(&mpam_list_lock);
>>>> +	do {
>>>> +		msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
>>>> +		if (!msc) {
>>>> +			err = -ENOMEM;
>>>> +			break;
>>>> +		}
>>>> +
>>>> +		mutex_init(&msc->probe_lock);
>>>> +		mutex_init(&msc->part_sel_lock);
>>>> +		mutex_init(&msc->outer_mon_sel_lock);
>>>> +		raw_spin_lock_init(&msc->inner_mon_sel_lock);
>>>> +		msc->id = mpam_num_msc++;
>>>> +		msc->pdev = pdev;
>>>> +		INIT_LIST_HEAD_RCU(&msc->glbl_list);

> msc->glbl_list is not a list head?  Does it need to be initialised at all?
> list_add_rcu() will just splat it, by the looks of it.

It's the member of the global list, hence the comment above it in the struct.
I've never known if the member 'list head' needs to be initialised or not, better safe
than sorry.



>>>> +		INIT_LIST_HEAD_RCU(&msc->ris);
> 
> Maybe INIT_LIST_HEAD_RCU() isn't needed here.  Do we ever access this
> list without holding one of the MSC locks?
> 
> This list is not used until a cpuhp hook comes in to probe the MSC, and
> then mpam_discovery_cpu_online() obtains a pointer via
> list_for_each_entry() -- but this is not RCU-protected.  The MSC probe
> lock is taken, there...

mpam_reprogram_msc(), mpam_msmon_reset_all_mbwu() and mpam_reset_msc() all do this
under (s)rcu.

The aim is to not take a lock on the read side. Once the cpuhp callbacks are registered
the only thing that writes to these lists is the error interrupt teardown.


>>>> +
>>>> +		err = update_msc_accessibility(msc);
>>>> +		if (err)
>>>> +			break;
>>>> +		if (cpumask_empty(&msc->accessibility)) {
>>>> +			pr_err_once("msc:%u is not accessible from any CPU!",
>>>> +				    msc->id);
>>>> +			err = -EINVAL;
>>>> +			break;
>>>> +		}
>>>> +
>>>> +		if (device_property_read_u32(&pdev->dev, "pcc-channel",
>>>> +					     &msc->pcc_subspace_id))
>>>> +			msc->iface = MPAM_IFACE_MMIO;
>>>> +		else
>>>> +			msc->iface = MPAM_IFACE_PCC;
>>>> +
>>>> +		if (msc->iface == MPAM_IFACE_MMIO) {
>>>> +			void __iomem *io;
>>>> +
>>>> +			io = devm_platform_get_and_ioremap_resource(pdev, 0,
>>>> +								    &msc_res);
>>>> +			if (IS_ERR(io)) {
>>>> +				pr_err("Failed to map MSC base address\n");
>>>> +				err = PTR_ERR(io);
>>>> +				break;
>>>> +			}
>>>> +			msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
>>>> +			msc->mapped_hwpage = io;
>>>> +		} else if (msc->iface == MPAM_IFACE_PCC) {
>>>> +			msc->pcc_cl.dev = &pdev->dev;
>>>> +			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
>>>> +			msc->pcc_cl.tx_block = false;
>>>> +			msc->pcc_cl.tx_tout = 1000; /* 1s */
>>>> +			msc->pcc_cl.knows_txdone = false;
> 
> ... so, it feels like we may need to hold the probe lock, or ensure
> that all iterations over the msc list are RCU-protected (see below for
> counterexamples), or both.

In here, the msc hasn't yet been added to the global list, so it can't be found by anyone.
Anyone that does find it, found it by taking the list_lock and walking the global list.


>>>> +
>>>> +			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
>>>> +								 msc->pcc_subspace_id);
>>>> +			if (IS_ERR(msc->pcc_chan)) {
>>>> +				pr_err("Failed to request MSC PCC channel\n");
>>>> +				err = PTR_ERR(msc->pcc_chan);
>>>> +				break;
>>>> +			}
>>>> +		}
>>
>>> Should the lock be held across initialisation of the msc fields?
>>
>> The msc isn't visible until its added to the list, so provided all that inialisation is
>> done 'before' its added to the list, it doesn't matter.

> Is it possible for other code to get a pointer to the new msc after
> this, other than by dereferencing the list?

While MSC are still being created - nothing walks the list. There are no interrupts or
hotplug callabacks.
Once all the MSC have been found, hardware probing starts.
During hardware probing, the cpuhp callback walks the global list under srcu.
Once all the hardware has been probed, the cpuhp callbacks are swapped out, and
mpam_enable_once() does things with static_keys and registering interrupts.
After this point, the mpam_class list/tree is used under srcu, and yes interrupts know
which MSC they are for.


> (The obvious case is the interrupt handlers, but it looks like the msc
> pointers used for registering the interrupts are indeed obtained
> through an RCU-protected iteration over mpam_all_msc.)
> 
> Note, there seem to be non-RCU-protected iterations over mpam_all_msc
> in the mpam_discovery_cpu_online() (patch 14) and

That was taking the write side lock unecessarily. I've fixed it to walk the list under srcu.


> mpam_enable_merge_features() paths (patch 18). 

Holds the write side lock.


> The lack of symmetry
> between list maintenance and consumption look a little suspect for
> those -- is safety ensured in some other way?

Readers must use the RCU primitives to be safe against a concurrent writer.
Writers must take some write side lock to be safe against each other - they don't need to
use the RCU list-walking primitives.


>>> list_add_rcu() might imply sufficient barriers to ensure that the
>>> initialisations are visible to other threads that obtain the msc
>>> pointer by iterating over mpam_all_msc.
>>>
>>> It's probably cleaner to hold the lock explicitly, though.
>>
>> The list lock? We do.
>> But the readers don't need to take the list lock, its only there to prevent concurrent
>> writers.
> 
> I meant whatever lock is supposed to protect the fields of the specific
> msc.  The list lock is not that lock.

We could bundle that under the probe_lock - but there would be no need to hold that here,
the struct mpam_msc isn't reachable.

[..]

>>>> +		mpam_discovery_complete();
>>>> +
>>>> +	if (err && msc)
>>>> +		mpam_msc_drv_remove(pdev);
>>>> +
>>>> +	return err;
>>>> +}
>>>> +/*
>>>> + * MSC that are hidden under caches are not created as platform devices
>>>> + * as there is no cache driver. Caches are also special-cased in
>>>> + * update_msc_accessibility().
>>>> + */
>>>
>>> Can you elaborate?  I don't understand quite what this is doing.
>>
>> / {
>>     my_thing {
>>         compatible = "my_thing";
>>         msc {
>>            compatible = "arm,mpam-msc";
>>         };
>>     };
>>
>>     other_thing {
>>         compatible = "other_thing";
>>     };
>>
>>     msc {
>>        compatible = "arm,mpam-msc";
>>        arm,mpam-device = <&other_thing>;
>>     };
>>
>>
>>     l2-cache {
>>       compatible = "cache";
>>         msc {
>>            compatible = "arm,mpam-msc";
>>         };
>>     };
>> };
>>
>> my_thing and other_thing's MSC will have devices created - but the cache will not, because
>> it's a cache not a device, and anything below it is ignored.
> 
> OK.  Maybe reword as something like:
> 
> --8<--
> 
> MSCs that are declared by the firmware as being part of a cache may not
> be created automatically as platform devices, since there is no
> dedicated cache driver.
> 
> Deal with those MSCs here.
> 
> -->8--
> 
> Maybe add a comment at update_msc_accessibility() that references this
> comment, instead of a reader of that function just needing to know that
> this comment is here?

I'll just drop the bit about update_msc_accessibility(), its less relevant once the next
patch adds the memory node in a similar way. (and even less relevant once I rip out DT
support)


>>>> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
>>>> new file mode 100644
>>>> index 000000000000..07e0f240eaca
>>>> --- /dev/null
>>>> +++ b/drivers/resctrl/mpam_internal.h
>>>> @@ -0,0 +1,62 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>>> +// Copyright (C) 2024 Arm Ltd.
>>>> +
>>>> +#ifndef MPAM_INTERNAL_H
>>>> +#define MPAM_INTERNAL_H
>>>> +
>>>> +#include <linux/arm_mpam.h>
>>>> +#include <linux/cpumask.h>
>>>> +#include <linux/io.h>
>>>> +#include <linux/mailbox_client.h>
>>>> +#include <linux/mutex.h>
>>>> +#include <linux/resctrl.h>
>>>> +#include <linux/sizes.h>
>>>> +
>>>> +struct mpam_msc {
>>>> +	/* member of mpam_all_msc */
>>>> +	struct list_head        glbl_list;
>>
>>> It is worth making these names less mismatched?
>>
>> all_msc_list ?
>>
>> It's because its global. The pattern otherwise is parent has a list foo, and all the
>> children have a member 'foo_list'.
> 
> "all_msc", then?
> 
> I'm not sure that it's essential for the list head's name to have
> "list" in it: that's clear from the type and from how it is used.
> "all" seems sufficient to imply that this is a list (?)
> 
> It doesn't get much more global than "all".
> 
> (This is purely cosmetic, of course.)

Ending in _list is part of a pattern with the class->component -->
component->component_list relationship. We can discuss whatever suffix is best on v2 - as
long as they all follow the same pattern!


Thanks,

James

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Ben Horgan 5 months, 1 week ago

Hi James,

On 8/22/25 16:29, James Morse wrote:
> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
> only be accessible from those CPUs, and they may not be online.
> Touching the hardware early is pointless as MPAM can't be used until
> the system-wide common values for num_partid and num_pmg have been
> discovered.
> 
> Start with driver probe/remove and mapping the MSC.
> 
> CC: Carl Worth <carl@os.amperecomputing.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> Changes since RFC:
>  * Check for status=broken DT devices.
>  * Moved all the files around.
>  * Made Kconfig symbols depend on EXPERT
> ---
>  arch/arm64/Kconfig              |   1 +
>  drivers/Kconfig                 |   2 +
>  drivers/Makefile                |   1 +
>  drivers/resctrl/Kconfig         |  11 ++
>  drivers/resctrl/Makefile        |   4 +
>  drivers/resctrl/mpam_devices.c  | 336 ++++++++++++++++++++++++++++++++
>  drivers/resctrl/mpam_internal.h |  62 ++++++
>  7 files changed, 417 insertions(+)
>  create mode 100644 drivers/resctrl/Kconfig
>  create mode 100644 drivers/resctrl/Makefile
>  create mode 100644 drivers/resctrl/mpam_devices.c
>  create mode 100644 drivers/resctrl/mpam_internal.h
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index e51ccf1da102..ea3c54e04275 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -2062,6 +2062,7 @@ config ARM64_TLB_RANGE
>  
>  config ARM64_MPAM
>  	bool "Enable support for MPAM"
> +	select ARM64_MPAM_DRIVER
>  	select ACPI_MPAM if ACPI
>  	help
>  	  Memory Partitioning and Monitoring is an optional extension
> diff --git a/drivers/Kconfig b/drivers/Kconfig
> index 4915a63866b0..3054b50a2f4c 100644
> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -251,4 +251,6 @@ source "drivers/hte/Kconfig"
>  
>  source "drivers/cdx/Kconfig"
>  
> +source "drivers/resctrl/Kconfig"
> +
>  endmenu
> diff --git a/drivers/Makefile b/drivers/Makefile
> index b5749cf67044..f41cf4eddeba 100644
> --- a/drivers/Makefile
> +++ b/drivers/Makefile
> @@ -194,5 +194,6 @@ obj-$(CONFIG_HTE)		+= hte/
>  obj-$(CONFIG_DRM_ACCEL)		+= accel/
>  obj-$(CONFIG_CDX_BUS)		+= cdx/
>  obj-$(CONFIG_DPLL)		+= dpll/
> +obj-y				+= resctrl/
>  
>  obj-$(CONFIG_S390)		+= s390/
> diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
> new file mode 100644
> index 000000000000..dff7b87280ab
> --- /dev/null
> +++ b/drivers/resctrl/Kconfig
> @@ -0,0 +1,11 @@
> +# Confusingly, this is everything but the CPU bits of MPAM. CPU here means
> +# CPU resources, not containers or cgroups etc.
> +config ARM64_MPAM_DRIVER
> +	bool "MPAM driver for System IP, e,g. caches and memory controllers"
> +	depends on ARM64_MPAM && EXPERT
> +
> +config ARM64_MPAM_DRIVER_DEBUG
> +	bool "Enable debug messages from the MPAM driver."
> +	depends on ARM64_MPAM_DRIVER
> +	help
> +	  Say yes here to enable debug messages from the MPAM driver.
> diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
> new file mode 100644
> index 000000000000..92b48fa20108
> --- /dev/null
> +++ b/drivers/resctrl/Makefile
> @@ -0,0 +1,4 @@
> +obj-$(CONFIG_ARM64_MPAM_DRIVER)			+= mpam.o
> +mpam-y						+= mpam_devices.o
> +
> +cflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG)	+= -DDEBUG
> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
> new file mode 100644
> index 000000000000..a0d9a699a6e7
> --- /dev/null
> +++ b/drivers/resctrl/mpam_devices.c
> @@ -0,0 +1,336 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (C) 2025 Arm Ltd.
> +
> +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
> +
> +#include <linux/acpi.h>
> +#include <linux/arm_mpam.h>
> +#include <linux/cacheinfo.h>
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/gfp.h>
> +#include <linux/list.h>
> +#include <linux/lockdep.h>
> +#include <linux/mutex.h>
> +#include <linux/of.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/printk.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/srcu.h>
> +#include <linux/types.h>
> +
> +#include <acpi/pcc.h>
> +
> +#include "mpam_internal.h"
> +
> +/*
> + * mpam_list_lock protects the SRCU lists when writing. Once the
> + * mpam_enabled key is enabled these lists are read-only,
> + * unless the error interrupt disables the driver.
> + */
> +static DEFINE_MUTEX(mpam_list_lock);
> +static LIST_HEAD(mpam_all_msc);
> +
> +static struct srcu_struct mpam_srcu;
> +
> +/* MPAM isn't available until all the MSC have been probed. */
> +static u32 mpam_num_msc;
> +
> +static void mpam_discovery_complete(void)
> +{
> +	pr_err("Discovered all MSC\n");
> +}
> +
> +static int mpam_dt_count_msc(void)
> +{
> +	int count = 0;
> +	struct device_node *np;
> +
> +	for_each_compatible_node(np, NULL, "arm,mpam-msc") {
> +		if (of_device_is_available(np))
> +			count++;
> +	}
> +
> +	return count;
> +}
> +
> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
> +				  u32 ris_idx)
> +{
> +	int err = 0;
> +	u32 level = 0;
> +	unsigned long cache_id;
> +	struct device_node *cache;
> +
> +	do {
> +		if (of_device_is_compatible(np, "arm,mpam-cache")) {
> +			cache = of_parse_phandle(np, "arm,mpam-device", 0);
> +			if (!cache) {
> +				pr_err("Failed to read phandle\n");
> +				break;
> +			}
> +		} else if (of_device_is_compatible(np->parent, "cache")) {
> +			cache = of_node_get(np->parent);
> +		} else {
> +			/* For now, only caches are supported */
> +			cache = NULL;
> +			break;
> +		}
> +
> +		err = of_property_read_u32(cache, "cache-level", &level);
> +		if (err) {
> +			pr_err("Failed to read cache-level\n");
> +			break;
> +		}
> +
> +		cache_id = cache_of_calculate_id(cache);
> +		if (cache_id == ~0UL) {
> +			err = -ENOENT;
> +			break;
> +		}
> +
> +		err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
> +				      cache_id);
> +	} while (0);
> +	of_node_put(cache);
> +
> +	return err;
> +}
> +
> +static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored)
> +{
> +	int err, num_ris = 0;
> +	const u32 *ris_idx_p;
> +	struct device_node *iter, *np;
> +
> +	np = msc->pdev->dev.of_node;
> +	for_each_child_of_node(np, iter) {
> +		ris_idx_p = of_get_property(iter, "reg", NULL);
> +		if (ris_idx_p) {
> +			num_ris++;
> +			err = mpam_dt_parse_resource(msc, iter, *ris_idx_p);
> +			if (err) {
> +				of_node_put(iter);
> +				return err;
> +			}
> +		}
> +	}
> +
> +	if (!num_ris)
> +		mpam_dt_parse_resource(msc, np, 0);
> +
> +	return err;
> +}
> +
> +/*
> + * An MSC can control traffic from a set of CPUs, but may only be accessible
> + * from a (hopefully wider) set of CPUs. The common reason for this is power
> + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the
> + * the corresponding cache may also be powered off. By making accesses from
> + * one of those CPUs, we ensure this isn't the case.
> + */
> +static int update_msc_accessibility(struct mpam_msc *msc)
> +{
> +	struct device_node *parent;
> +	u32 affinity_id;
> +	int err;
> +
> +	if (!acpi_disabled) {
> +		err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity",
> +					       &affinity_id);
> +		if (err)
> +			cpumask_copy(&msc->accessibility, cpu_possible_mask);
> +		else
> +			acpi_pptt_get_cpus_from_container(affinity_id,
> +							  &msc->accessibility);
> +
> +		return 0;
> +	}
> +
> +	/* This depends on the path to of_node */
> +	parent = of_get_parent(msc->pdev->dev.of_node);
> +	if (parent == of_root) {
> +		cpumask_copy(&msc->accessibility, cpu_possible_mask);
> +		err = 0;
> +	} else {
> +		err = -EINVAL;
> +		pr_err("Cannot determine accessibility of MSC: %s\n",
> +		       dev_name(&msc->pdev->dev));
> +	}
> +	of_node_put(parent);
> +
> +	return err;
> +}
> +
> +static int fw_num_msc;
> +
> +static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg)
> +{
> +	/* TODO: wake up tasks blocked on this MSC's PCC channel */
> +}
> +
> +static void mpam_msc_drv_remove(struct platform_device *pdev)
> +{
> +	struct mpam_msc *msc = platform_get_drvdata(pdev);
> +
> +	if (!msc)
> +		return;
> +
> +	mutex_lock(&mpam_list_lock);
> +	mpam_num_msc--;
> +	platform_set_drvdata(pdev, NULL);
> +	list_del_rcu(&msc->glbl_list);
> +	synchronize_srcu(&mpam_srcu);
> +	devm_kfree(&pdev->dev, msc);
> +	mutex_unlock(&mpam_list_lock);
> +}
> +
> +static int mpam_msc_drv_probe(struct platform_device *pdev)
> +{
> +	int err;
> +	struct mpam_msc *msc;
> +	struct resource *msc_res;
> +	void *plat_data = pdev->dev.platform_data;
> +
> +	mutex_lock(&mpam_list_lock);
> +	do {
> +		msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
> +		if (!msc) {
> +			err = -ENOMEM;
> +			break;
> +		}
> +
> +		mutex_init(&msc->probe_lock);
> +		mutex_init(&msc->part_sel_lock);
> +		mutex_init(&msc->outer_mon_sel_lock);
> +		raw_spin_lock_init(&msc->inner_mon_sel_lock);
> +		msc->id = mpam_num_msc++;
> +		msc->pdev = pdev;
> +		INIT_LIST_HEAD_RCU(&msc->glbl_list);
> +		INIT_LIST_HEAD_RCU(&msc->ris);
> +
> +		err = update_msc_accessibility(msc);
> +		if (err)
> +			break;
> +		if (cpumask_empty(&msc->accessibility)) {
> +			pr_err_once("msc:%u is not accessible from any CPU!",
> +				    msc->id);
> +			err = -EINVAL;
> +			break;
> +		}
> +
> +		if (device_property_read_u32(&pdev->dev, "pcc-channel",
> +					     &msc->pcc_subspace_id))
> +			msc->iface = MPAM_IFACE_MMIO;
> +		else
> +			msc->iface = MPAM_IFACE_PCC;
> +
> +		if (msc->iface == MPAM_IFACE_MMIO) {
> +			void __iomem *io;
> +
> +			io = devm_platform_get_and_ioremap_resource(pdev, 0,
> +								    &msc_res);
> +			if (IS_ERR(io)) {
> +				pr_err("Failed to map MSC base address\n");
> +				err = PTR_ERR(io);
> +				break;
> +			}
> +			msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
> +			msc->mapped_hwpage = io;
> +		} else if (msc->iface == MPAM_IFACE_PCC) {
> +			msc->pcc_cl.dev = &pdev->dev;
> +			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
> +			msc->pcc_cl.tx_block = false;
> +			msc->pcc_cl.tx_tout = 1000; /* 1s */
> +			msc->pcc_cl.knows_txdone = false;
> +
> +			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
> +								 msc->pcc_subspace_id);
> +			if (IS_ERR(msc->pcc_chan)) {
> +				pr_err("Failed to request MSC PCC channel\n");
> +				err = PTR_ERR(msc->pcc_chan);
> +				break;
> +			}
> +		}
> +
> +		list_add_rcu(&msc->glbl_list, &mpam_all_msc);
> +		platform_set_drvdata(pdev, msc);
> +	} while (0);
> +	mutex_unlock(&mpam_list_lock);
> +
> +	if (!err) {
> +		/* Create RIS entries described by firmware */
> +		if (!acpi_disabled)
> +			err = acpi_mpam_parse_resources(msc, plat_data);
> +		else
> +			err = mpam_dt_parse_resources(msc, plat_data);
> +	}
> +
> +	if (!err && fw_num_msc == mpam_num_msc)
> +		mpam_discovery_complete();
> +
> +	if (err && msc)
> +		mpam_msc_drv_remove(pdev);
> +
> +	return err;
> +}
> +
> +static const struct of_device_id mpam_of_match[] = {
> +	{ .compatible = "arm,mpam-msc", },
> +	{},
> +};
> +MODULE_DEVICE_TABLE(of, mpam_of_match);
> +
> +static struct platform_driver mpam_msc_driver = {
> +	.driver = {
> +		.name = "mpam_msc",
> +		.of_match_table = of_match_ptr(mpam_of_match),
> +	},
> +	.probe = mpam_msc_drv_probe,
> +	.remove = mpam_msc_drv_remove,
> +};
> +
> +/*
> + * MSC that are hidden under caches are not created as platform devices
> + * as there is no cache driver. Caches are also special-cased in
> + * update_msc_accessibility().
> + */
> +static void mpam_dt_create_foundling_msc(void)
> +{
> +	int err;
> +	struct device_node *cache;
> +
> +	for_each_compatible_node(cache, NULL, "cache") {
> +		err = of_platform_populate(cache, mpam_of_match, NULL, NULL);
> +		if (err)
> +			pr_err("Failed to create MSC devices under caches\n");
> +	}
> +}
> +
> +static int __init mpam_msc_driver_init(void)
> +{
> +	if (!system_supports_mpam())
> +		return -EOPNOTSUPP;
> +
> +	init_srcu_struct(&mpam_srcu);
> +
> +	if (!acpi_disabled)
> +		fw_num_msc = acpi_mpam_count_msc();
> +	else
> +		fw_num_msc = mpam_dt_count_msc();
> +
> +	if (fw_num_msc <= 0) {
> +		pr_err("No MSC devices found in firmware\n");
> +		return -EINVAL;
> +	}
> +
> +	if (acpi_disabled)
> +		mpam_dt_create_foundling_msc();
> +
> +	return platform_driver_register(&mpam_msc_driver);
> +}
> +subsys_initcall(mpam_msc_driver_init);
> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> new file mode 100644
> index 000000000000..07e0f240eaca
> --- /dev/null
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -0,0 +1,62 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +// Copyright (C) 2024 Arm Ltd.
> +
> +#ifndef MPAM_INTERNAL_H
> +#define MPAM_INTERNAL_H
> +
> +#include <linux/arm_mpam.h>
> +#include <linux/cpumask.h>
> +#include <linux/io.h>
> +#include <linux/mailbox_client.h>
> +#include <linux/mutex.h>
> +#include <linux/resctrl.h>
> +#include <linux/sizes.h>
> +
> +struct mpam_msc {
> +	/* member of mpam_all_msc */
> +	struct list_head        glbl_list;
> +
> +	int			id;
> +	struct platform_device *pdev;
> +
> +	/* Not modified after mpam_is_enabled() becomes true */
> +	enum mpam_msc_iface	iface;
> +	u32			pcc_subspace_id;
> +	struct mbox_client	pcc_cl;
> +	struct pcc_mbox_chan	*pcc_chan;
> +	u32			nrdy_usec;
> +	cpumask_t		accessibility;
> +
> +	/*
> +	 * probe_lock is only take during discovery. After discovery these
> +	 * properties become read-only and the lists are protected by SRCU.
> +	 */
> +	struct mutex		probe_lock;
> +	unsigned long		ris_idxs[128 / BITS_PER_LONG];

Why is this sized this way? RIS_MAX is 4 bits and so there are at most
16 RIS per msc.

> +	u32			ris_max;
> +
> +	/* mpam_msc_ris of this component */
> +	struct list_head	ris;
> +
> +	/*
> +	 * part_sel_lock protects access to the MSC hardware registers that are
> +	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
> +	 * by RIS).
> +	 * If needed, take msc->lock first.
> +	 */
> +	struct mutex		part_sel_lock;
> +
> +	/*
> +	 * mon_sel_lock protects access to the MSC hardware registers that are
> +	 * affeted by MPAMCFG_MON_SEL.
> +	 * If needed, take msc->lock first.
> +	 */
> +	struct mutex		outer_mon_sel_lock;
> +	raw_spinlock_t		inner_mon_sel_lock;
> +	unsigned long		inner_mon_sel_flags;
> +
> +	void __iomem		*mapped_hwpage;
> +	size_t			mapped_hwpage_sz;
> +};
> +
> +#endif /* MPAM_INTERNAL_H */

Thanks,

Ben

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by James Morse 5 months ago

Hi Ben,

On 01/09/2025 10:11, Ben Horgan wrote:
> On 8/22/25 16:29, James Morse wrote:
>> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
>> only be accessible from those CPUs, and they may not be online.
>> Touching the hardware early is pointless as MPAM can't be used until
>> the system-wide common values for num_partid and num_pmg have been
>> discovered.
>>
>> Start with driver probe/remove and mapping the MSC.

>> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
>> new file mode 100644
>> index 000000000000..07e0f240eaca
>> --- /dev/null
>> +++ b/drivers/resctrl/mpam_internal.h
>> @@ -0,0 +1,62 @@

>> +struct mpam_msc {
>> +	/* member of mpam_all_msc */
>> +	struct list_head        glbl_list;
>> +
>> +	int			id;
>> +	struct platform_device *pdev;
>> +
>> +	/* Not modified after mpam_is_enabled() becomes true */
>> +	enum mpam_msc_iface	iface;
>> +	u32			pcc_subspace_id;
>> +	struct mbox_client	pcc_cl;
>> +	struct pcc_mbox_chan	*pcc_chan;
>> +	u32			nrdy_usec;
>> +	cpumask_t		accessibility;
>> +
>> +	/*
>> +	 * probe_lock is only take during discovery. After discovery these
>> +	 * properties become read-only and the lists are protected by SRCU.
>> +	 */
>> +	struct mutex		probe_lock;
>> +	unsigned long		ris_idxs[128 / BITS_PER_LONG];

> Why is this sized this way? RIS_MAX is 4 bits and so there are at most
> 16 RIS per msc.

Hmmm, lost in time - I agree with the 16 reasoning. Fixed.
(It's likely due to RES0 space above the field - but that has been filled in with other
 stuff since then. RIS was added as a 'backward compatible feature' - I was wary of them
 extending it)


>> +	u32			ris_max;
>> +
>> +	/* mpam_msc_ris of this component */
>> +	struct list_head	ris;
>> +
>> +	/*
>> +	 * part_sel_lock protects access to the MSC hardware registers that are
>> +	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
>> +	 * by RIS).
>> +	 * If needed, take msc->lock first.
>> +	 */
>> +	struct mutex		part_sel_lock;
>> +
>> +	/*
>> +	 * mon_sel_lock protects access to the MSC hardware registers that are
>> +	 * affeted by MPAMCFG_MON_SEL.
>> +	 * If needed, take msc->lock first.
>> +	 */
>> +	struct mutex		outer_mon_sel_lock;
>> +	raw_spinlock_t		inner_mon_sel_lock;
>> +	unsigned long		inner_mon_sel_flags;
>> +
>> +	void __iomem		*mapped_hwpage;
>> +	size_t			mapped_hwpage_sz;
>> +};
>> +
>> +#endif /* MPAM_INTERNAL_H */

Thanks,

James

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Rob Herring 5 months, 2 weeks ago

On Fri, Aug 22, 2025 at 10:32 AM James Morse <james.morse@arm.com> wrote:
>
> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
> only be accessible from those CPUs, and they may not be online.
> Touching the hardware early is pointless as MPAM can't be used until
> the system-wide common values for num_partid and num_pmg have been
> discovered.
>
> Start with driver probe/remove and mapping the MSC.
>
> CC: Carl Worth <carl@os.amperecomputing.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> Changes since RFC:
>  * Check for status=broken DT devices.

No such status... 'disabled' can be for a variety of reasons.

>  * Moved all the files around.
>  * Made Kconfig symbols depend on EXPERT
> ---
>  arch/arm64/Kconfig              |   1 +
>  drivers/Kconfig                 |   2 +
>  drivers/Makefile                |   1 +
>  drivers/resctrl/Kconfig         |  11 ++
>  drivers/resctrl/Makefile        |   4 +
>  drivers/resctrl/mpam_devices.c  | 336 ++++++++++++++++++++++++++++++++
>  drivers/resctrl/mpam_internal.h |  62 ++++++
>  7 files changed, 417 insertions(+)
>  create mode 100644 drivers/resctrl/Kconfig
>  create mode 100644 drivers/resctrl/Makefile
>  create mode 100644 drivers/resctrl/mpam_devices.c
>  create mode 100644 drivers/resctrl/mpam_internal.h
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index e51ccf1da102..ea3c54e04275 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -2062,6 +2062,7 @@ config ARM64_TLB_RANGE
>
>  config ARM64_MPAM
>         bool "Enable support for MPAM"
> +       select ARM64_MPAM_DRIVER
>         select ACPI_MPAM if ACPI
>         help
>           Memory Partitioning and Monitoring is an optional extension
> diff --git a/drivers/Kconfig b/drivers/Kconfig
> index 4915a63866b0..3054b50a2f4c 100644
> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -251,4 +251,6 @@ source "drivers/hte/Kconfig"
>
>  source "drivers/cdx/Kconfig"
>
> +source "drivers/resctrl/Kconfig"
> +
>  endmenu
> diff --git a/drivers/Makefile b/drivers/Makefile
> index b5749cf67044..f41cf4eddeba 100644
> --- a/drivers/Makefile
> +++ b/drivers/Makefile
> @@ -194,5 +194,6 @@ obj-$(CONFIG_HTE)           += hte/
>  obj-$(CONFIG_DRM_ACCEL)                += accel/
>  obj-$(CONFIG_CDX_BUS)          += cdx/
>  obj-$(CONFIG_DPLL)             += dpll/
> +obj-y                          += resctrl/
>
>  obj-$(CONFIG_S390)             += s390/
> diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
> new file mode 100644
> index 000000000000..dff7b87280ab
> --- /dev/null
> +++ b/drivers/resctrl/Kconfig
> @@ -0,0 +1,11 @@
> +# Confusingly, this is everything but the CPU bits of MPAM. CPU here means
> +# CPU resources, not containers or cgroups etc.
> +config ARM64_MPAM_DRIVER
> +       bool "MPAM driver for System IP, e,g. caches and memory controllers"
> +       depends on ARM64_MPAM && EXPERT
> +
> +config ARM64_MPAM_DRIVER_DEBUG
> +       bool "Enable debug messages from the MPAM driver."
> +       depends on ARM64_MPAM_DRIVER
> +       help
> +         Say yes here to enable debug messages from the MPAM driver.
> diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
> new file mode 100644
> index 000000000000..92b48fa20108
> --- /dev/null
> +++ b/drivers/resctrl/Makefile
> @@ -0,0 +1,4 @@
> +obj-$(CONFIG_ARM64_MPAM_DRIVER)                        += mpam.o
> +mpam-y                                         += mpam_devices.o
> +
> +cflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG)       += -DDEBUG
> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
> new file mode 100644
> index 000000000000..a0d9a699a6e7
> --- /dev/null
> +++ b/drivers/resctrl/mpam_devices.c
> @@ -0,0 +1,336 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (C) 2025 Arm Ltd.

Given the 2024 below, should this be 2024-2025?

> +
> +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
> +
> +#include <linux/acpi.h>
> +#include <linux/arm_mpam.h>
> +#include <linux/cacheinfo.h>
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/gfp.h>
> +#include <linux/list.h>
> +#include <linux/lockdep.h>
> +#include <linux/mutex.h>
> +#include <linux/of.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/printk.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/srcu.h>
> +#include <linux/types.h>
> +
> +#include <acpi/pcc.h>
> +
> +#include "mpam_internal.h"
> +
> +/*
> + * mpam_list_lock protects the SRCU lists when writing. Once the
> + * mpam_enabled key is enabled these lists are read-only,
> + * unless the error interrupt disables the driver.
> + */
> +static DEFINE_MUTEX(mpam_list_lock);
> +static LIST_HEAD(mpam_all_msc);
> +
> +static struct srcu_struct mpam_srcu;
> +
> +/* MPAM isn't available until all the MSC have been probed. */
> +static u32 mpam_num_msc;
> +
> +static void mpam_discovery_complete(void)
> +{
> +       pr_err("Discovered all MSC\n");

Perhaps print out how many MSCs.

> +}
> +
> +static int mpam_dt_count_msc(void)
> +{
> +       int count = 0;
> +       struct device_node *np;
> +
> +       for_each_compatible_node(np, NULL, "arm,mpam-msc") {
> +               if (of_device_is_available(np))
> +                       count++;
> +       }
> +
> +       return count;
> +}
> +
> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
> +                                 u32 ris_idx)
> +{
> +       int err = 0;
> +       u32 level = 0;
> +       unsigned long cache_id;
> +       struct device_node *cache;
> +
> +       do {
> +               if (of_device_is_compatible(np, "arm,mpam-cache")) {
> +                       cache = of_parse_phandle(np, "arm,mpam-device", 0);
> +                       if (!cache) {
> +                               pr_err("Failed to read phandle\n");
> +                               break;
> +                       }
> +               } else if (of_device_is_compatible(np->parent, "cache")) {

Don't access device_node members. I'm trying to make it opaque. And
technically it can be racy to access parent ptr when/if nodes are
dynamic. I think this should suffice:

else {
  cache = of_get_parent(np);
  if (!of_device_is_compatible(cache, "cache")) {
    cache = NULL;
    break;
  }
}

> +                       cache = of_node_get(np->parent);
> +               } else {
> +                       /* For now, only caches are supported */
> +                       cache = NULL;
> +                       break;
> +               }
> +
> +               err = of_property_read_u32(cache, "cache-level", &level);
> +               if (err) {
> +                       pr_err("Failed to read cache-level\n");
> +                       break;
> +               }
> +
> +               cache_id = cache_of_calculate_id(cache);
> +               if (cache_id == ~0UL) {
> +                       err = -ENOENT;
> +                       break;
> +               }
> +
> +               err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
> +                                     cache_id);
> +       } while (0);
> +       of_node_put(cache);
> +
> +       return err;
> +}
> +
> +static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored)
> +{
> +       int err, num_ris = 0;
> +       const u32 *ris_idx_p;
> +       struct device_node *iter, *np;
> +
> +       np = msc->pdev->dev.of_node;
> +       for_each_child_of_node(np, iter) {

Use for_each_available_child_of_node_scoped()

> +               ris_idx_p = of_get_property(iter, "reg", NULL);

This is broken on big endian and new users of of_get_property() are
discouraged. Use of_property_read_reg().

> +               if (ris_idx_p) {
> +                       num_ris++;
> +                       err = mpam_dt_parse_resource(msc, iter, *ris_idx_p);
> +                       if (err) {
> +                               of_node_put(iter);

And then drop the put.

> +                               return err;
> +                       }
> +               }
> +       }
> +
> +       if (!num_ris)
> +               mpam_dt_parse_resource(msc, np, 0);
> +
> +       return err;
> +}
> +
> +/*
> + * An MSC can control traffic from a set of CPUs, but may only be accessible
> + * from a (hopefully wider) set of CPUs. The common reason for this is power
> + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the
> + * the corresponding cache may also be powered off. By making accesses from
> + * one of those CPUs, we ensure this isn't the case.
> + */
> +static int update_msc_accessibility(struct mpam_msc *msc)
> +{
> +       struct device_node *parent;
> +       u32 affinity_id;
> +       int err;
> +
> +       if (!acpi_disabled) {
> +               err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity",
> +                                              &affinity_id);
> +               if (err)
> +                       cpumask_copy(&msc->accessibility, cpu_possible_mask);
> +               else
> +                       acpi_pptt_get_cpus_from_container(affinity_id,
> +                                                         &msc->accessibility);
> +
> +               return 0;
> +       }
> +
> +       /* This depends on the path to of_node */

I'm failing to understand what has to be at the root node?

> +       parent = of_get_parent(msc->pdev->dev.of_node);
> +       if (parent == of_root) {
> +               cpumask_copy(&msc->accessibility, cpu_possible_mask);
> +               err = 0;
> +       } else {
> +               err = -EINVAL;
> +               pr_err("Cannot determine accessibility of MSC: %s\n",
> +                      dev_name(&msc->pdev->dev));
> +       }
> +       of_node_put(parent);
> +
> +       return err;
> +}
> +
> +static int fw_num_msc;
> +
> +static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg)
> +{
> +       /* TODO: wake up tasks blocked on this MSC's PCC channel */
> +}
> +
> +static void mpam_msc_drv_remove(struct platform_device *pdev)
> +{
> +       struct mpam_msc *msc = platform_get_drvdata(pdev);
> +
> +       if (!msc)
> +               return;
> +
> +       mutex_lock(&mpam_list_lock);
> +       mpam_num_msc--;
> +       platform_set_drvdata(pdev, NULL);
> +       list_del_rcu(&msc->glbl_list);
> +       synchronize_srcu(&mpam_srcu);
> +       devm_kfree(&pdev->dev, msc);

This should happen automagically.

> +       mutex_unlock(&mpam_list_lock);
> +}
> +
> +static int mpam_msc_drv_probe(struct platform_device *pdev)
> +{
> +       int err;
> +       struct mpam_msc *msc;
> +       struct resource *msc_res;
> +       void *plat_data = pdev->dev.platform_data;
> +
> +       mutex_lock(&mpam_list_lock);
> +       do {
> +               msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
> +               if (!msc) {
> +                       err = -ENOMEM;
> +                       break;
> +               }
> +
> +               mutex_init(&msc->probe_lock);
> +               mutex_init(&msc->part_sel_lock);
> +               mutex_init(&msc->outer_mon_sel_lock);
> +               raw_spin_lock_init(&msc->inner_mon_sel_lock);
> +               msc->id = mpam_num_msc++;

Multiple probe functions can run in parallel, so this needs to be
atomic. Maybe it is with mpam_list_lock, but then the name of the
mutex is misleading given this is not the list. It's not really clear
to me what all needs the mutex here. Certainly a lot of it doesn't.
Like everything else above here except the increment.

> +               msc->pdev = pdev;
> +               INIT_LIST_HEAD_RCU(&msc->glbl_list);
> +               INIT_LIST_HEAD_RCU(&msc->ris);
> +
> +               err = update_msc_accessibility(msc);
> +               if (err)
> +                       break;
> +               if (cpumask_empty(&msc->accessibility)) {
> +                       pr_err_once("msc:%u is not accessible from any CPU!",
> +                                   msc->id);
> +                       err = -EINVAL;
> +                       break;
> +               }
> +
> +               if (device_property_read_u32(&pdev->dev, "pcc-channel",

Does this property apply to DT? It would as the code is written. It is
not documented though.

> +                                            &msc->pcc_subspace_id))
> +                       msc->iface = MPAM_IFACE_MMIO;
> +               else
> +                       msc->iface = MPAM_IFACE_PCC;
> +
> +               if (msc->iface == MPAM_IFACE_MMIO) {
> +                       void __iomem *io;
> +
> +                       io = devm_platform_get_and_ioremap_resource(pdev, 0,
> +                                                                   &msc_res);
> +                       if (IS_ERR(io)) {
> +                               pr_err("Failed to map MSC base address\n");
> +                               err = PTR_ERR(io);
> +                               break;
> +                       }
> +                       msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
> +                       msc->mapped_hwpage = io;
> +               } else if (msc->iface == MPAM_IFACE_PCC) {
> +                       msc->pcc_cl.dev = &pdev->dev;
> +                       msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
> +                       msc->pcc_cl.tx_block = false;
> +                       msc->pcc_cl.tx_tout = 1000; /* 1s */
> +                       msc->pcc_cl.knows_txdone = false;
> +
> +                       msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
> +                                                                msc->pcc_subspace_id);
> +                       if (IS_ERR(msc->pcc_chan)) {
> +                               pr_err("Failed to request MSC PCC channel\n");
> +                               err = PTR_ERR(msc->pcc_chan);
> +                               break;
> +                       }
> +               }
> +
> +               list_add_rcu(&msc->glbl_list, &mpam_all_msc);
> +               platform_set_drvdata(pdev, msc);
> +       } while (0);
> +       mutex_unlock(&mpam_list_lock);
> +
> +       if (!err) {
> +               /* Create RIS entries described by firmware */
> +               if (!acpi_disabled)
> +                       err = acpi_mpam_parse_resources(msc, plat_data);
> +               else
> +                       err = mpam_dt_parse_resources(msc, plat_data);

Isn't there a race here if an error occurs since you already added the
MSC to the list? Something like this sequence with 2 MSCs:

device 1 probe
device 1 added
    device 2 probe
    device 2 added
device 1 calls mpam_discovery_complete()
    device 2 error on parse_resources
    device 2 removed

> +       }
> +
> +       if (!err && fw_num_msc == mpam_num_msc)
> +               mpam_discovery_complete();
> +
> +       if (err && msc)
> +               mpam_msc_drv_remove(pdev);
> +
> +       return err;
> +}
> +
> +static const struct of_device_id mpam_of_match[] = {
> +       { .compatible = "arm,mpam-msc", },
> +       {},
> +};
> +MODULE_DEVICE_TABLE(of, mpam_of_match);
> +
> +static struct platform_driver mpam_msc_driver = {
> +       .driver = {
> +               .name = "mpam_msc",
> +               .of_match_table = of_match_ptr(mpam_of_match),
> +       },
> +       .probe = mpam_msc_drv_probe,
> +       .remove = mpam_msc_drv_remove,
> +};
> +
> +/*
> + * MSC that are hidden under caches are not created as platform devices
> + * as there is no cache driver. Caches are also special-cased in
> + * update_msc_accessibility().
> + */
> +static void mpam_dt_create_foundling_msc(void)
> +{
> +       int err;
> +       struct device_node *cache;
> +
> +       for_each_compatible_node(cache, NULL, "cache") {
> +               err = of_platform_populate(cache, mpam_of_match, NULL, NULL);

This is going to create platform devices for all caches (except L1)
regardless of whether they support MPAM or not. Isn't it likely or
possible that only L3 or SLC caches support MPAM?

> +               if (err)
> +                       pr_err("Failed to create MSC devices under caches\n");
> +       }
> +}
> +
> +static int __init mpam_msc_driver_init(void)
> +{
> +       if (!system_supports_mpam())
> +               return -EOPNOTSUPP;
> +
> +       init_srcu_struct(&mpam_srcu);
> +
> +       if (!acpi_disabled)
> +               fw_num_msc = acpi_mpam_count_msc();
> +       else
> +               fw_num_msc = mpam_dt_count_msc();
> +
> +       if (fw_num_msc <= 0) {
> +               pr_err("No MSC devices found in firmware\n");
> +               return -EINVAL;
> +       }
> +
> +       if (acpi_disabled)
> +               mpam_dt_create_foundling_msc();
> +
> +       return platform_driver_register(&mpam_msc_driver);
> +}
> +subsys_initcall(mpam_msc_driver_init);
> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> new file mode 100644
> index 000000000000..07e0f240eaca
> --- /dev/null
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -0,0 +1,62 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +// Copyright (C) 2024 Arm Ltd.

It's 2025.

> +
> +#ifndef MPAM_INTERNAL_H
> +#define MPAM_INTERNAL_H
> +
> +#include <linux/arm_mpam.h>
> +#include <linux/cpumask.h>
> +#include <linux/io.h>
> +#include <linux/mailbox_client.h>
> +#include <linux/mutex.h>
> +#include <linux/resctrl.h>
> +#include <linux/sizes.h>
> +
> +struct mpam_msc {
> +       /* member of mpam_all_msc */
> +       struct list_head        glbl_list;
> +
> +       int                     id;
> +       struct platform_device *pdev;
> +
> +       /* Not modified after mpam_is_enabled() becomes true */
> +       enum mpam_msc_iface     iface;
> +       u32                     pcc_subspace_id;
> +       struct mbox_client      pcc_cl;
> +       struct pcc_mbox_chan    *pcc_chan;
> +       u32                     nrdy_usec;
> +       cpumask_t               accessibility;
> +
> +       /*
> +        * probe_lock is only take during discovery. After discovery these

s/take/taken/

> +        * properties become read-only and the lists are protected by SRCU.
> +        */
> +       struct mutex            probe_lock;
> +       unsigned long           ris_idxs[128 / BITS_PER_LONG];
> +       u32                     ris_max;
> +
> +       /* mpam_msc_ris of this component */
> +       struct list_head        ris;
> +
> +       /*
> +        * part_sel_lock protects access to the MSC hardware registers that are
> +        * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
> +        * by RIS).
> +        * If needed, take msc->lock first.

Stale comment? I don't see any 'lock' member.

> +        */
> +       struct mutex            part_sel_lock;
> +
> +       /*
> +        * mon_sel_lock protects access to the MSC hardware registers that are
> +        * affeted by MPAMCFG_MON_SEL.
> +        * If needed, take msc->lock first.
> +        */
> +       struct mutex            outer_mon_sel_lock;
> +       raw_spinlock_t          inner_mon_sel_lock;
> +       unsigned long           inner_mon_sel_flags;
> +
> +       void __iomem            *mapped_hwpage;
> +       size_t                  mapped_hwpage_sz;
> +};
> +
> +#endif /* MPAM_INTERNAL_H */
> --
> 2.20.1
>

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by James Morse 5 months ago

Hi Rob,

On 27/08/2025 16:39, Rob Herring wrote:
> On Fri, Aug 22, 2025 at 10:32 AM James Morse <james.morse@arm.com> wrote:
>>
>> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
>> only be accessible from those CPUs, and they may not be online.
>> Touching the hardware early is pointless as MPAM can't be used until
>> the system-wide common values for num_partid and num_pmg have been
>> discovered.
>>
>> Start with driver probe/remove and mapping the MSC.


>> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
>> new file mode 100644
>> index 000000000000..a0d9a699a6e7
>> --- /dev/null
>> +++ b/drivers/resctrl/mpam_devices.c
>> @@ -0,0 +1,336 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +// Copyright (C) 2025 Arm Ltd.
>
> Given the 2024 below, should this be 2024-2025?

I've never known what this year is really for. People clearly expect it to be this year
... I've evidently missed one somewhere. I'll fix that. ... I wrote some of this code in
2018, so there are a range of options on what it 'should' be ...


>> +
>> +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
>> +
>> +#include <linux/acpi.h>
>> +#include <linux/arm_mpam.h>
>> +#include <linux/cacheinfo.h>
>> +#include <linux/cpu.h>
>> +#include <linux/cpumask.h>
>> +#include <linux/device.h>
>> +#include <linux/errno.h>
>> +#include <linux/gfp.h>
>> +#include <linux/list.h>
>> +#include <linux/lockdep.h>
>> +#include <linux/mutex.h>
>> +#include <linux/of.h>
>> +#include <linux/of_platform.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/printk.h>
>> +#include <linux/slab.h>
>> +#include <linux/spinlock.h>
>> +#include <linux/srcu.h>
>> +#include <linux/types.h>
>> +
>> +#include <acpi/pcc.h>
>> +
>> +#include "mpam_internal.h"
>> +
>> +/*
>> + * mpam_list_lock protects the SRCU lists when writing. Once the
>> + * mpam_enabled key is enabled these lists are read-only,
>> + * unless the error interrupt disables the driver.
>> + */
>> +static DEFINE_MUTEX(mpam_list_lock);
>> +static LIST_HEAD(mpam_all_msc);
>> +
>> +static struct srcu_struct mpam_srcu;
>> +
>> +/* MPAM isn't available until all the MSC have been probed. */
>> +static u32 mpam_num_msc;
>> +
>> +static void mpam_discovery_complete(void)
>> +{
>> +       pr_err("Discovered all MSC\n");

> Perhaps print out how many MSCs.
Once the whole thing is assembled the mpam_enable() call prints the number of PARTID/PMG,
which is something user-space can do something with. I don't think the number of MSC is
useful to anyone as some of them are grouped together and can't be configured independently.


>> +}

>> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
>> +                                 u32 ris_idx)
>> +{
>> +       int err = 0;
>> +       u32 level = 0;
>> +       unsigned long cache_id;
>> +       struct device_node *cache;
>> +
>> +       do {
>> +               if (of_device_is_compatible(np, "arm,mpam-cache")) {
>> +                       cache = of_parse_phandle(np, "arm,mpam-device", 0);
>> +                       if (!cache) {
>> +                               pr_err("Failed to read phandle\n");
>> +                               break;
>> +                       }
>> +               } else if (of_device_is_compatible(np->parent, "cache")) {
>
> Don't access device_node members. I'm trying to make it opaque. And
> technically it can be racy to access parent ptr when/if nodes are
> dynamic. I think this should suffice:
>
> else {
>   cache = of_get_parent(np);
>   if (!of_device_is_compatible(cache, "cache")) {
>     cache = NULL;
>     break;
>   }
> }

Thanks!

The if/else-if ladder needs to stay, I've grabbed the parent earlier, and added a
of_node_put() of it after the do/while.


>> +                       cache = of_node_get(np->parent);
>> +               } else {
>> +                       /* For now, only caches are supported */
>> +                       cache = NULL;
>> +                       break;
>> +               }
>> +
>> +               err = of_property_read_u32(cache, "cache-level", &level);
>> +               if (err) {
>> +                       pr_err("Failed to read cache-level\n");
>> +                       break;
>> +               }
>> +
>> +               cache_id = cache_of_calculate_id(cache);
>> +               if (cache_id == ~0UL) {
>> +                       err = -ENOENT;
>> +                       break;
>> +               }
>> +
>> +               err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
>> +                                     cache_id);
>> +       } while (0);
>> +       of_node_put(cache);
>> +
>> +       return err;
>> +}
>> +
>> +static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored)
>> +{
>> +       int err, num_ris = 0;
>> +       const u32 *ris_idx_p;
>> +       struct device_node *iter, *np;
>> +
>> +       np = msc->pdev->dev.of_node;
>> +       for_each_child_of_node(np, iter) {
>
> Use for_each_available_child_of_node_scoped()

Sure,


>> +               ris_idx_p = of_get_property(iter, "reg", NULL);
>
> This is broken on big endian and new users of of_get_property() are
> discouraged. Use of_property_read_reg().

Done,


>> +               if (ris_idx_p) {
>> +                       num_ris++;
>> +                       err = mpam_dt_parse_resource(msc, iter, *ris_idx_p);
>> +                       if (err) {
>> +                               of_node_put(iter);
>
> And then drop the put.
>
>> +                               return err;
>> +                       }
>> +               }
>> +       }
>> +
>> +       if (!num_ris)
>> +               mpam_dt_parse_resource(msc, np, 0);
>> +
>> +       return err;
>> +}


>> +/*
>> + * An MSC can control traffic from a set of CPUs, but may only be accessible
>> + * from a (hopefully wider) set of CPUs. The common reason for this is power
>> + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the
>> + * the corresponding cache may also be powered off. By making accesses from
>> + * one of those CPUs, we ensure this isn't the case.
>> + */
>> +static int update_msc_accessibility(struct mpam_msc *msc)
>> +{
>> +       struct device_node *parent;
>> +       u32 affinity_id;
>> +       int err;
>> +
>> +       if (!acpi_disabled) {
>> +               err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity",
>> +                                              &affinity_id);
>> +               if (err)
>> +                       cpumask_copy(&msc->accessibility, cpu_possible_mask);
>> +               else
>> +                       acpi_pptt_get_cpus_from_container(affinity_id,
>> +                                                         &msc->accessibility);
>> +
>> +               return 0;
>> +       }
>> +
>> +       /* This depends on the path to of_node */

> I'm failing to understand what has to be at the root node?

The accessibility bitmap. If the MSC is at the root of the tree - its assumed to be
global. If its buried in a device, its assumed to be in the same power domain as that
device. Practically this only matters for caches where PSCI's CPU_SUSPEND can turn the
cache off, so the cache MSC can only be accessed from the CPU's local to it, that way we
know its not about to be turned off by PSCI.

I'll rephrase the comment - its trying to explain why its not explicitly encoded.
| /* Where an MSC can be accessed from depends on the path to of_node. */


>> +       parent = of_get_parent(msc->pdev->dev.of_node);
>> +       if (parent == of_root) {
>> +               cpumask_copy(&msc->accessibility, cpu_possible_mask);
>> +               err = 0;
>> +       } else {
>> +               err = -EINVAL;
>> +               pr_err("Cannot determine accessibility of MSC: %s\n",
>> +                      dev_name(&msc->pdev->dev));
>> +       }
>> +       of_node_put(parent);
>> +
>> +       return err;
>> +}
>> +static void mpam_msc_drv_remove(struct platform_device *pdev)
>> +{
>> +       struct mpam_msc *msc = platform_get_drvdata(pdev);
>> +
>> +       if (!msc)
>> +               return;
>> +
>> +       mutex_lock(&mpam_list_lock);
>> +       mpam_num_msc--;
>> +       platform_set_drvdata(pdev, NULL);
>> +       list_del_rcu(&msc->glbl_list);
>> +       synchronize_srcu(&mpam_srcu);
>> +       devm_kfree(&pdev->dev, msc);
>
> This should happen automagically.
>
>> +       mutex_unlock(&mpam_list_lock);
>> +}
>> +
>> +static int mpam_msc_drv_probe(struct platform_device *pdev)
>> +{
>> +       int err;
>> +       struct mpam_msc *msc;
>> +       struct resource *msc_res;
>> +       void *plat_data = pdev->dev.platform_data;
>> +
>> +       mutex_lock(&mpam_list_lock);
>> +       do {
>> +               msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
>> +               if (!msc) {
>> +                       err = -ENOMEM;
>> +                       break;
>> +               }
>> +
>> +               mutex_init(&msc->probe_lock);
>> +               mutex_init(&msc->part_sel_lock);
>> +               mutex_init(&msc->outer_mon_sel_lock);
>> +               raw_spin_lock_init(&msc->inner_mon_sel_lock);
>> +               msc->id = mpam_num_msc++;
>
> Multiple probe functions can run in parallel, so this needs to be
> atomic. Maybe it is with mpam_list_lock, but then the name of the
> mutex is misleading given this is not the list. It's not really clear
> to me what all needs the mutex here. Certainly a lot of it doesn't.
> Like everything else above here except the increment.

It's more about the class/component/device lists that get added later - but more on this
mpam_num_msc thing below...


>> +               msc->pdev = pdev;
>> +               INIT_LIST_HEAD_RCU(&msc->glbl_list);
>> +               INIT_LIST_HEAD_RCU(&msc->ris);
>> +
>> +               err = update_msc_accessibility(msc);
>> +               if (err)
>> +                       break;
>> +               if (cpumask_empty(&msc->accessibility)) {
>> +                       pr_err_once("msc:%u is not accessible from any CPU!",
>> +                                   msc->id);
>> +                       err = -EINVAL;
>> +                       break;
>> +               }
>> +
>> +               if (device_property_read_u32(&pdev->dev, "pcc-channel",

> Does this property apply to DT? It would as the code is written. It is
> not documented though.

I don't think so - on DT PCC is going to be spelled SCMI which somes with some kind of
discovery instead. This property is added by the ACPI table 'driver'.


>> +                                            &msc->pcc_subspace_id))
>> +                       msc->iface = MPAM_IFACE_MMIO;
>> +               else
>> +                       msc->iface = MPAM_IFACE_PCC;
>> +
>> +               if (msc->iface == MPAM_IFACE_MMIO) {
>> +                       void __iomem *io;
>> +
>> +                       io = devm_platform_get_and_ioremap_resource(pdev, 0,
>> +                                                                   &msc_res);
>> +                       if (IS_ERR(io)) {
>> +                               pr_err("Failed to map MSC base address\n");
>> +                               err = PTR_ERR(io);
>> +                               break;
>> +                       }
>> +                       msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
>> +                       msc->mapped_hwpage = io;
>> +               } else if (msc->iface == MPAM_IFACE_PCC) {
>> +                       msc->pcc_cl.dev = &pdev->dev;
>> +                       msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
>> +                       msc->pcc_cl.tx_block = false;
>> +                       msc->pcc_cl.tx_tout = 1000; /* 1s */
>> +                       msc->pcc_cl.knows_txdone = false;
>> +
>> +                       msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
>> +                                                                msc->pcc_subspace_id);
>> +                       if (IS_ERR(msc->pcc_chan)) {
>> +                               pr_err("Failed to request MSC PCC channel\n");
>> +                               err = PTR_ERR(msc->pcc_chan);
>> +                               break;
>> +                       }
>> +               }
>> +
>> +               list_add_rcu(&msc->glbl_list, &mpam_all_msc);
>> +               platform_set_drvdata(pdev, msc);
>> +       } while (0);
>> +       mutex_unlock(&mpam_list_lock);
>> +
>> +       if (!err) {
>> +               /* Create RIS entries described by firmware */
>> +               if (!acpi_disabled)
>> +                       err = acpi_mpam_parse_resources(msc, plat_data);
>> +               else
>> +                       err = mpam_dt_parse_resources(msc, plat_data);
>
> Isn't there a race here if an error occurs since you already added the
> MSC to the list? Something like this sequence with 2 MSCs:
>
> device 1 probe
> device 1 added
>     device 2 probe
>     device 2 added
> device 1 calls mpam_discovery_complete()
>     device 2 error on parse_resources
>     device 2 removed

By the time the whole thing is assembled the 'discovery complete' work is scheduled by a
cpuhp callback and has better protection against this. That discovery-complete call is
just to help illustrate that there is stuff that happens once all the MSC have been
discovered, as that code gets much more complicated later in teh series.

Combined with your comment about the msc_id increment above - I'll stop using that as both
a count and an id, re-using the pdev->id as its more likely to be stable from boot to
boot. mpam_num_msc can then become an atomic_t that is incremented once we know we're not
going to remove the MSC from the list.



>> +       }
>> +
>> +       if (!err && fw_num_msc == mpam_num_msc)
>> +               mpam_discovery_complete();
>> +
>> +       if (err && msc)
>> +               mpam_msc_drv_remove(pdev);
>> +
>> +       return err;
>> +}
>> +/*
>> + * MSC that are hidden under caches are not created as platform devices
>> + * as there is no cache driver. Caches are also special-cased in
>> + * update_msc_accessibility().
>> + */
>> +static void mpam_dt_create_foundling_msc(void)
>> +{
>> +       int err;
>> +       struct device_node *cache;
>> +
>> +       for_each_compatible_node(cache, NULL, "cache") {
>> +               err = of_platform_populate(cache, mpam_of_match, NULL, NULL);
>
> This is going to create platform devices for all caches (except L1)
> regardless of whether they support MPAM or not. Isn't it likely or
> possible that only L3 or SLC caches support MPAM?

Likely - but all things are possible. You could put an MPAM MSC in your L1-I cache.
(or even in the CPU - but lets not go there)

I'll attempt to fix that up, what I have doesn't quite work, but I'll keep picking at it:
|        for_each_compatible_node(cache, NULL, "cache") {
|               struct device_node *cache_device;
|
|               if (of_node_check_flag(cache, OF_POPULATED))
|                       continue;
|
|               cache_device = of_find_matching_node_and_match(cache, mpam_of_match, NULL);
|               if (!cache_device)
|                       continue;
|               of_node_put(cache_device);
|
|               pdev = of_platform_device_create(cache, "cache", NULL);
|               if (!pdev)
|                       pr_err_once("Failed to create MSC devices under caches\n");
|          }


>> +               if (err)
>> +                       pr_err("Failed to create MSC devices under caches\n");
>> +       }
>> +}
>> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
>> new file mode 100644
>> index 000000000000..07e0f240eaca
>> --- /dev/null
>> +++ b/drivers/resctrl/mpam_internal.h
>> @@ -0,0 +1,62 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +// Copyright (C) 2024 Arm Ltd.
>
> It's 2025.
... and I wrote this in 2018, but missed the annual update. I'll fix it.


>> +
>> +struct mpam_msc {
>> +       /* member of mpam_all_msc */
>> +       struct list_head        glbl_list;
>> +
>> +       int                     id;
>> +       struct platform_device *pdev;
>> +
>> +       /* Not modified after mpam_is_enabled() becomes true */
>> +       enum mpam_msc_iface     iface;
>> +       u32                     pcc_subspace_id;
>> +       struct mbox_client      pcc_cl;
>> +       struct pcc_mbox_chan    *pcc_chan;
>> +       u32                     nrdy_usec;
>> +       cpumask_t               accessibility;
>> +
>> +       /*
>> +        * probe_lock is only take during discovery. After discovery these

> s/take/taken/

Fixed,


>> +        * properties become read-only and the lists are protected by SRCU.
>> +        */
>> +       struct mutex            probe_lock;
>> +       unsigned long           ris_idxs[128 / BITS_PER_LONG];
>> +       u32                     ris_max;
>> +
>> +       /* mpam_msc_ris of this component */
>> +       struct list_head        ris;
>> +
>> +       /*
>> +        * part_sel_lock protects access to the MSC hardware registers that are
>> +        * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
>> +        * by RIS).
>> +        * If needed, take msc->lock first.

> Stale comment? I don't see any 'lock' member.

Yes, it should say probe_lock .. renamed after more locks got added in here, and the
comment got fixed up in the wrong patch. Fixed.


>> +        */
>> +       struct mutex            part_sel_lock;
>> +
>> +       /*
>> +        * mon_sel_lock protects access to the MSC hardware registers that are
>> +        * affeted by MPAMCFG_MON_SEL.
>> +        * If needed, take msc->lock first.
>> +        */
>> +       struct mutex            outer_mon_sel_lock;
>> +       raw_spinlock_t          inner_mon_sel_lock;
>> +       unsigned long           inner_mon_sel_flags;
>> +
>> +       void __iomem            *mapped_hwpage;
>> +       size_t                  mapped_hwpage_sz;
>> +};
>> +
>> +#endif /* MPAM_INTERNAL_H */
>> --
>> 2.20.1
>>

Thanks,

James

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Rob Herring 5 months, 2 weeks ago

On Wed, Aug 27, 2025 at 10:39 AM Rob Herring <robh@kernel.org> wrote:
>
> On Fri, Aug 22, 2025 at 10:32 AM James Morse <james.morse@arm.com> wrote:
> >
> > Probing MPAM is convoluted. MSCs that are integrated with a CPU may
> > only be accessible from those CPUs, and they may not be online.
> > Touching the hardware early is pointless as MPAM can't be used until
> > the system-wide common values for num_partid and num_pmg have been
> > discovered.

[...]

> > +static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored)
> > +{
> > +       int err, num_ris = 0;
> > +       const u32 *ris_idx_p;
> > +       struct device_node *iter, *np;
> > +
> > +       np = msc->pdev->dev.of_node;
> > +       for_each_child_of_node(np, iter) {
>
> Use for_each_available_child_of_node_scoped()
>
> > +               ris_idx_p = of_get_property(iter, "reg", NULL);
>
> This is broken on big endian and new users of of_get_property() are
> discouraged. Use of_property_read_reg().

Err, this is broken on little endian as the DT is big endian.

So this was obviously not tested as I'm confident you didn't test on BE.

Rob

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by James Morse 5 months ago

Hi Rob,

On 27/08/2025 17:16, Rob Herring wrote:
> On Wed, Aug 27, 2025 at 10:39 AM Rob Herring <robh@kernel.org> wrote:
>>
>> On Fri, Aug 22, 2025 at 10:32 AM James Morse <james.morse@arm.com> wrote:
>>>
>>> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
>>> only be accessible from those CPUs, and they may not be online.
>>> Touching the hardware early is pointless as MPAM can't be used until
>>> the system-wide common values for num_partid and num_pmg have been
>>> discovered.
>
> [...]
>
>>> +static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored)
>>> +{
>>> +       int err, num_ris = 0;
>>> +       const u32 *ris_idx_p;
>>> +       struct device_node *iter, *np;
>>> +
>>> +       np = msc->pdev->dev.of_node;
>>> +       for_each_child_of_node(np, iter) {
>>
>> Use for_each_available_child_of_node_scoped()
>>
>>> +               ris_idx_p = of_get_property(iter, "reg", NULL);
>>
>> This is broken on big endian and new users of of_get_property() are
>> discouraged. Use of_property_read_reg().
>
> Err, this is broken on little endian as the DT is big endian.
>
> So this was obviously not tested as I'm confident you didn't test on BE.

'not tested' is shades of grey. I fed the FVP ~6 different DTB files to hit the different
paths through the driver. The FVP only has controls under RIS-0, so all of those only
defined RIS-0, and unsurprisingly didn't notice the helper isn't endian safe.


Thanks,

James

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Ben Horgan 5 months, 2 weeks ago

Hi James,

On 8/22/25 16:29, James Morse wrote:
> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
> only be accessible from those CPUs, and they may not be online.
> Touching the hardware early is pointless as MPAM can't be used until
> the system-wide common values for num_partid and num_pmg have been
> discovered.
> 
> Start with driver probe/remove and mapping the MSC.
> 
> CC: Carl Worth <carl@os.amperecomputing.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> Changes since RFC:
>  * Check for status=broken DT devices.
>  * Moved all the files around.
>  * Made Kconfig symbols depend on EXPERT
> ---
>  arch/arm64/Kconfig              |   1 +
>  drivers/Kconfig                 |   2 +
>  drivers/Makefile                |   1 +
>  drivers/resctrl/Kconfig         |  11 ++
>  drivers/resctrl/Makefile        |   4 +
>  drivers/resctrl/mpam_devices.c  | 336 ++++++++++++++++++++++++++++++++
>  drivers/resctrl/mpam_internal.h |  62 ++++++
>  7 files changed, 417 insertions(+)
>  create mode 100644 drivers/resctrl/Kconfig
>  create mode 100644 drivers/resctrl/Makefile
>  create mode 100644 drivers/resctrl/mpam_devices.c
>  create mode 100644 drivers/resctrl/mpam_internal.h
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index e51ccf1da102..ea3c54e04275 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -2062,6 +2062,7 @@ config ARM64_TLB_RANGE
>  
>  config ARM64_MPAM
>  	bool "Enable support for MPAM"
> +	select ARM64_MPAM_DRIVER
>  	select ACPI_MPAM if ACPI
>  	help
>  	  Memory Partitioning and Monitoring is an optional extension
> diff --git a/drivers/Kconfig b/drivers/Kconfig
> index 4915a63866b0..3054b50a2f4c 100644
> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -251,4 +251,6 @@ source "drivers/hte/Kconfig"
>  
>  source "drivers/cdx/Kconfig"
>  
> +source "drivers/resctrl/Kconfig"
> +
>  endmenu
> diff --git a/drivers/Makefile b/drivers/Makefile
> index b5749cf67044..f41cf4eddeba 100644
> --- a/drivers/Makefile
> +++ b/drivers/Makefile
> @@ -194,5 +194,6 @@ obj-$(CONFIG_HTE)		+= hte/
>  obj-$(CONFIG_DRM_ACCEL)		+= accel/
>  obj-$(CONFIG_CDX_BUS)		+= cdx/
>  obj-$(CONFIG_DPLL)		+= dpll/
> +obj-y				+= resctrl/
>  
>  obj-$(CONFIG_S390)		+= s390/
> diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
> new file mode 100644
> index 000000000000..dff7b87280ab
> --- /dev/null
> +++ b/drivers/resctrl/Kconfig
> @@ -0,0 +1,11 @@
> +# Confusingly, this is everything but the CPU bits of MPAM. CPU here means
> +# CPU resources, not containers or cgroups etc.
> +config ARM64_MPAM_DRIVER
> +	bool "MPAM driver for System IP, e,g. caches and memory controllers"
> +	depends on ARM64_MPAM && EXPERT
> +
> +config ARM64_MPAM_DRIVER_DEBUG
> +	bool "Enable debug messages from the MPAM driver."
> +	depends on ARM64_MPAM_DRIVER
> +	help
> +	  Say yes here to enable debug messages from the MPAM driver.
> diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
> new file mode 100644
> index 000000000000..92b48fa20108
> --- /dev/null
> +++ b/drivers/resctrl/Makefile
> @@ -0,0 +1,4 @@
> +obj-$(CONFIG_ARM64_MPAM_DRIVER)			+= mpam.o
> +mpam-y						+= mpam_devices.o
> +
> +cflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG)	+= -DDEBUG
> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
> new file mode 100644
> index 000000000000..a0d9a699a6e7
> --- /dev/null
> +++ b/drivers/resctrl/mpam_devices.c
> @@ -0,0 +1,336 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (C) 2025 Arm Ltd.
> +
> +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
> +
> +#include <linux/acpi.h>
> +#include <linux/arm_mpam.h>
> +#include <linux/cacheinfo.h>
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/errno.h>
> +#include <linux/gfp.h>
> +#include <linux/list.h>
> +#include <linux/lockdep.h>
> +#include <linux/mutex.h>
> +#include <linux/of.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/printk.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/srcu.h>
> +#include <linux/types.h>
> +
> +#include <acpi/pcc.h>
> +
> +#include "mpam_internal.h"
> +
> +/*
> + * mpam_list_lock protects the SRCU lists when writing. Once the
> + * mpam_enabled key is enabled these lists are read-only,
> + * unless the error interrupt disables the driver.
> + */
> +static DEFINE_MUTEX(mpam_list_lock);
> +static LIST_HEAD(mpam_all_msc);
> +
> +static struct srcu_struct mpam_srcu;
> +
> +/* MPAM isn't available until all the MSC have been probed. */
> +static u32 mpam_num_msc;
> +
> +static void mpam_discovery_complete(void)
> +{
> +	pr_err("Discovered all MSC\n");
> +}
> +
> +static int mpam_dt_count_msc(void)
> +{
> +	int count = 0;
> +	struct device_node *np;
> +
> +	for_each_compatible_node(np, NULL, "arm,mpam-msc") {
> +		if (of_device_is_available(np))
> +			count++;
> +	}
> +
> +	return count;
> +}
> +
> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
> +				  u32 ris_idx)
> +{
> +	int err = 0;
> +	u32 level = 0;
> +	unsigned long cache_id;
> +	struct device_node *cache;
> +
> +	do {
> +		if (of_device_is_compatible(np, "arm,mpam-cache")) {
> +			cache = of_parse_phandle(np, "arm,mpam-device", 0);
> +			if (!cache) {
> +				pr_err("Failed to read phandle\n");
> +				break;
> +			}
This looks like this allows "arm,mpam-cache" and "arm,mpam-device" to be
used on an msc node when there are no ris children. This usage could be
reasonable but doesn't match the schema in the previous patch. Should
this usage be rejected or the schema extended?

> +		} else if (of_device_is_compatible(np->parent, "cache")) {
> +			cache = of_node_get(np->parent);
> +		} else {
> +			/* For now, only caches are supported */
> +			cache = NULL;
> +			break;
> +		}
> +
> +		err = of_property_read_u32(cache, "cache-level", &level);
> +		if (err) {
> +			pr_err("Failed to read cache-level\n");
> +			break;
> +		}
> +
> +		cache_id = cache_of_calculate_id(cache);
> +		if (cache_id == ~0UL) {
> +			err = -ENOENT;
> +			break;
> +		}
> +
> +		err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
> +				      cache_id);
> +	} while (0);
> +	of_node_put(cache);
> +
> +	return err;
> +}
> +
> +static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored)
> +{
> +	int err, num_ris = 0;
> +	const u32 *ris_idx_p;
> +	struct device_node *iter, *np;
> +
> +	np = msc->pdev->dev.of_node;
> +	for_each_child_of_node(np, iter) {
> +		ris_idx_p = of_get_property(iter, "reg", NULL);
> +		if (ris_idx_p) {
> +			num_ris++;
> +			err = mpam_dt_parse_resource(msc, iter, *ris_idx_p);
> +			if (err) {
> +				of_node_put(iter);
> +				return err;
> +			}
> +		}
> +	}
> +
> +	if (!num_ris)
> +		mpam_dt_parse_resource(msc, np, 0);
> +
> +	return err;
> +}
> +
> +/*
> + * An MSC can control traffic from a set of CPUs, but may only be accessible
> + * from a (hopefully wider) set of CPUs. The common reason for this is power
> + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the
> + * the corresponding cache may also be powered off. By making accesses from
> + * one of those CPUs, we ensure this isn't the case.
> + */
> +static int update_msc_accessibility(struct mpam_msc *msc)
> +{
> +	struct device_node *parent;
> +	u32 affinity_id;
> +	int err;
> +
> +	if (!acpi_disabled) {
> +		err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity",
> +					       &affinity_id);
> +		if (err)
> +			cpumask_copy(&msc->accessibility, cpu_possible_mask);
> +		else
> +			acpi_pptt_get_cpus_from_container(affinity_id,
> +							  &msc->accessibility);
> +
> +		return 0;
> +	}
> +
> +	/* This depends on the path to of_node */
> +	parent = of_get_parent(msc->pdev->dev.of_node);
> +	if (parent == of_root) {
> +		cpumask_copy(&msc->accessibility, cpu_possible_mask);
> +		err = 0;
> +	} else {
> +		err = -EINVAL;
> +		pr_err("Cannot determine accessibility of MSC: %s\n",
> +		       dev_name(&msc->pdev->dev));
> +	}
> +	of_node_put(parent);
> +
> +	return err;
> +}
> +
> +static int fw_num_msc;
> +
> +static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg)
> +{
> +	/* TODO: wake up tasks blocked on this MSC's PCC channel */
> +}
> +
> +static void mpam_msc_drv_remove(struct platform_device *pdev)
> +{
> +	struct mpam_msc *msc = platform_get_drvdata(pdev);
> +
> +	if (!msc)
> +		return;
> +
> +	mutex_lock(&mpam_list_lock);
> +	mpam_num_msc--;
> +	platform_set_drvdata(pdev, NULL);
> +	list_del_rcu(&msc->glbl_list);
> +	synchronize_srcu(&mpam_srcu);
> +	devm_kfree(&pdev->dev, msc);
> +	mutex_unlock(&mpam_list_lock);
> +}
> +
> +static int mpam_msc_drv_probe(struct platform_device *pdev)
> +{
> +	int err;
> +	struct mpam_msc *msc;
> +	struct resource *msc_res;
> +	void *plat_data = pdev->dev.platform_data;
> +
> +	mutex_lock(&mpam_list_lock);
> +	do {
> +		msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
> +		if (!msc) {
> +			err = -ENOMEM;
> +			break;
> +		}
> +
> +		mutex_init(&msc->probe_lock);
> +		mutex_init(&msc->part_sel_lock);
> +		mutex_init(&msc->outer_mon_sel_lock);
> +		raw_spin_lock_init(&msc->inner_mon_sel_lock);
> +		msc->id = mpam_num_msc++;
> +		msc->pdev = pdev;
> +		INIT_LIST_HEAD_RCU(&msc->glbl_list);
> +		INIT_LIST_HEAD_RCU(&msc->ris);
> +
> +		err = update_msc_accessibility(msc);
> +		if (err)
> +			break;
> +		if (cpumask_empty(&msc->accessibility)) {
> +			pr_err_once("msc:%u is not accessible from any CPU!",
> +				    msc->id);
> +			err = -EINVAL;
> +			break;
> +		}
> +
> +		if (device_property_read_u32(&pdev->dev, "pcc-channel",
> +					     &msc->pcc_subspace_id))
> +			msc->iface = MPAM_IFACE_MMIO;
> +		else
> +			msc->iface = MPAM_IFACE_PCC;
> +
> +		if (msc->iface == MPAM_IFACE_MMIO) {
> +			void __iomem *io;
> +
> +			io = devm_platform_get_and_ioremap_resource(pdev, 0,
> +								    &msc_res);
> +			if (IS_ERR(io)) {
> +				pr_err("Failed to map MSC base address\n");
> +				err = PTR_ERR(io);
> +				break;
> +			}
> +			msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
> +			msc->mapped_hwpage = io;
> +		} else if (msc->iface == MPAM_IFACE_PCC) {
> +			msc->pcc_cl.dev = &pdev->dev;
> +			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
> +			msc->pcc_cl.tx_block = false;
> +			msc->pcc_cl.tx_tout = 1000; /* 1s */
> +			msc->pcc_cl.knows_txdone = false;
> +
> +			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
> +								 msc->pcc_subspace_id);
> +			if (IS_ERR(msc->pcc_chan)) {
> +				pr_err("Failed to request MSC PCC channel\n");
> +				err = PTR_ERR(msc->pcc_chan);
> +				break;
> +			}
I don't see pcc support added in this series. Should we fail the probe
if this interface is specified?

(If keeping, there is a missing pcc_mbox_free_channel() on the error path.)

> +		}
> +
> +		list_add_rcu(&msc->glbl_list, &mpam_all_msc);
> +		platform_set_drvdata(pdev, msc);
> +	} while (0);
> +	mutex_unlock(&mpam_list_lock);
> +
> +	if (!err) {
> +		/* Create RIS entries described by firmware */
> +		if (!acpi_disabled)
> +			err = acpi_mpam_parse_resources(msc, plat_data);
> +		else
> +			err = mpam_dt_parse_resources(msc, plat_data);
> +	}
> +
> +	if (!err && fw_num_msc == mpam_num_msc)
> +		mpam_discovery_complete();
> +
> +	if (err && msc)
> +		mpam_msc_drv_remove(pdev);
> +
> +	return err;
> +}
> +
> +static const struct of_device_id mpam_of_match[] = {
> +	{ .compatible = "arm,mpam-msc", },
> +	{},
> +};
> +MODULE_DEVICE_TABLE(of, mpam_of_match);
> +
> +static struct platform_driver mpam_msc_driver = {
> +	.driver = {
> +		.name = "mpam_msc",
> +		.of_match_table = of_match_ptr(mpam_of_match),
> +	},
> +	.probe = mpam_msc_drv_probe,
> +	.remove = mpam_msc_drv_remove,
> +};
> +
> +/*
> + * MSC that are hidden under caches are not created as platform devices
> + * as there is no cache driver. Caches are also special-cased in
> + * update_msc_accessibility().
> + */
> +static void mpam_dt_create_foundling_msc(void)
> +{
> +	int err;
> +	struct device_node *cache;
> +
> +	for_each_compatible_node(cache, NULL, "cache") {
> +		err = of_platform_populate(cache, mpam_of_match, NULL, NULL);
> +		if (err)
> +			pr_err("Failed to create MSC devices under caches\n");
> +	}
> +}
> +
> +static int __init mpam_msc_driver_init(void)
> +{
> +	if (!system_supports_mpam())
> +		return -EOPNOTSUPP;
> +
> +	init_srcu_struct(&mpam_srcu);
> +
> +	if (!acpi_disabled)
> +		fw_num_msc = acpi_mpam_count_msc();
> +	else
> +		fw_num_msc = mpam_dt_count_msc();
> +
> +	if (fw_num_msc <= 0) {
> +		pr_err("No MSC devices found in firmware\n");
> +		return -EINVAL;
> +	}
> +
> +	if (acpi_disabled)
> +		mpam_dt_create_foundling_msc();
> +
> +	return platform_driver_register(&mpam_msc_driver);
> +}
> +subsys_initcall(mpam_msc_driver_init);
> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> new file mode 100644
> index 000000000000..07e0f240eaca
> --- /dev/null
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -0,0 +1,62 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +// Copyright (C) 2024 Arm Ltd.
> +
> +#ifndef MPAM_INTERNAL_H
> +#define MPAM_INTERNAL_H
> +
> +#include <linux/arm_mpam.h>
> +#include <linux/cpumask.h>
> +#include <linux/io.h>
> +#include <linux/mailbox_client.h>
> +#include <linux/mutex.h>
> +#include <linux/resctrl.h>
> +#include <linux/sizes.h>
> +
> +struct mpam_msc {
> +	/* member of mpam_all_msc */
> +	struct list_head        glbl_list;
> +
> +	int			id;
> +	struct platform_device *pdev;
> +
> +	/* Not modified after mpam_is_enabled() becomes true */
> +	enum mpam_msc_iface	iface;
> +	u32			pcc_subspace_id;
> +	struct mbox_client	pcc_cl;
> +	struct pcc_mbox_chan	*pcc_chan;
> +	u32			nrdy_usec;
> +	cpumask_t		accessibility;
> +
> +	/*
> +	 * probe_lock is only take during discovery. After discovery these
nit: s/take/taken/
> +	 * properties become read-only and the lists are protected by SRCU.
> +	 */
> +	struct mutex		probe_lock;
> +	unsigned long		ris_idxs[128 / BITS_PER_LONG];
> +	u32			ris_max;
> +
> +	/* mpam_msc_ris of this component */
> +	struct list_head	ris;
> +
> +	/*
> +	 * part_sel_lock protects access to the MSC hardware registers that are
> +	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
> +	 * by RIS).
> +	 * If needed, take msc->lock first.
> +	 */
> +	struct mutex		part_sel_lock;
> +
> +	/*
> +	 * mon_sel_lock protects access to the MSC hardware registers that are
> +	 * affeted by MPAMCFG_MON_SEL.
nit: s/affeted/affected/
> +	 * If needed, take msc->lock first.
> +	 */
> +	struct mutex		outer_mon_sel_lock;
> +	raw_spinlock_t		inner_mon_sel_lock;
> +	unsigned long		inner_mon_sel_flags;
> +
> +	void __iomem		*mapped_hwpage;
> +	size_t			mapped_hwpage_sz;
> +};
> +
> +#endif /* MPAM_INTERNAL_H */

Thanks,

Ben

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by James Morse 5 months ago

Hi Ben,

On 27/08/2025 14:03, Ben Horgan wrote:
> On 8/22/25 16:29, James Morse wrote:
>> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
>> only be accessible from those CPUs, and they may not be online.
>> Touching the hardware early is pointless as MPAM can't be used until
>> the system-wide common values for num_partid and num_pmg have been
>> discovered.
>>
>> Start with driver probe/remove and mapping the MSC.

>> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
>> new file mode 100644
>> index 000000000000..a0d9a699a6e7
>> --- /dev/null
>> +++ b/drivers/resctrl/mpam_devices.c
>> @@ -0,0 +1,336 @@

>> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
>> +				  u32 ris_idx)
>> +{
>> +	int err = 0;
>> +	u32 level = 0;
>> +	unsigned long cache_id;
>> +	struct device_node *cache;
>> +
>> +	do {
>> +		if (of_device_is_compatible(np, "arm,mpam-cache")) {
>> +			cache = of_parse_phandle(np, "arm,mpam-device", 0);
>> +			if (!cache) {
>> +				pr_err("Failed to read phandle\n");
>> +				break;
>> +			}
> This looks like this allows "arm,mpam-cache" and "arm,mpam-device" to be
> used on an msc node when there are no ris children. This usage could be
> reasonable but doesn't match the schema in the previous patch. Should
> this usage be rejected or the schema extended?

The DT/ACPI stuff is only going to describe the things that make sense at a high level,
e.g. the controls for the L3. There may be other controls for stuff that doesn't make
sense in the hardware - these get discovered, grouped as 'unknown' and left alone.

Another angle on this is where there is an MSC that the OS will never make use of, but
needs to know about to find the system wide minimum value. (there is a comment about
this in the ACPI spec...)

I don't think its a problem if the magic dt-binding machinery is overly restrictive, that
is about validating DTB files...


>> +		} else if (of_device_is_compatible(np->parent, "cache")) {
>> +			cache = of_node_get(np->parent);
>> +		} else {
>> +			/* For now, only caches are supported */
>> +			cache = NULL;
>> +			break;
>> +		}
>> +
>> +		err = of_property_read_u32(cache, "cache-level", &level);
>> +		if (err) {
>> +			pr_err("Failed to read cache-level\n");
>> +			break;
>> +		}
>> +
>> +		cache_id = cache_of_calculate_id(cache);
>> +		if (cache_id == ~0UL) {
>> +			err = -ENOENT;
>> +			break;
>> +		}
>> +
>> +		err = mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level,
>> +				      cache_id);
>> +	} while (0);
>> +	of_node_put(cache);
>> +
>> +	return err;
>> +}

>> +static int mpam_msc_drv_probe(struct platform_device *pdev)
>> +{
>> +	int err;
>> +	struct mpam_msc *msc;
>> +	struct resource *msc_res;
>> +	void *plat_data = pdev->dev.platform_data;
>> +
>> +	mutex_lock(&mpam_list_lock);
>> +	do {
>> +		msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL);
>> +		if (!msc) {
>> +			err = -ENOMEM;
>> +			break;
>> +		}
>> +
>> +		mutex_init(&msc->probe_lock);
>> +		mutex_init(&msc->part_sel_lock);
>> +		mutex_init(&msc->outer_mon_sel_lock);
>> +		raw_spin_lock_init(&msc->inner_mon_sel_lock);
>> +		msc->id = mpam_num_msc++;
>> +		msc->pdev = pdev;
>> +		INIT_LIST_HEAD_RCU(&msc->glbl_list);
>> +		INIT_LIST_HEAD_RCU(&msc->ris);
>> +
>> +		err = update_msc_accessibility(msc);
>> +		if (err)
>> +			break;
>> +		if (cpumask_empty(&msc->accessibility)) {
>> +			pr_err_once("msc:%u is not accessible from any CPU!",
>> +				    msc->id);
>> +			err = -EINVAL;
>> +			break;
>> +		}
>> +
>> +		if (device_property_read_u32(&pdev->dev, "pcc-channel",
>> +					     &msc->pcc_subspace_id))
>> +			msc->iface = MPAM_IFACE_MMIO;
>> +		else
>> +			msc->iface = MPAM_IFACE_PCC;
>> +
>> +		if (msc->iface == MPAM_IFACE_MMIO) {
>> +			void __iomem *io;
>> +
>> +			io = devm_platform_get_and_ioremap_resource(pdev, 0,
>> +								    &msc_res);
>> +			if (IS_ERR(io)) {
>> +				pr_err("Failed to map MSC base address\n");
>> +				err = PTR_ERR(io);
>> +				break;
>> +			}
>> +			msc->mapped_hwpage_sz = msc_res->end - msc_res->start;
>> +			msc->mapped_hwpage = io;
>> +		} else if (msc->iface == MPAM_IFACE_PCC) {
>> +			msc->pcc_cl.dev = &pdev->dev;
>> +			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
>> +			msc->pcc_cl.tx_block = false;
>> +			msc->pcc_cl.tx_tout = 1000; /* 1s */
>> +			msc->pcc_cl.knows_txdone = false;
>> +
>> +			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
>> +								 msc->pcc_subspace_id);
>> +			if (IS_ERR(msc->pcc_chan)) {
>> +				pr_err("Failed to request MSC PCC channel\n");
>> +				err = PTR_ERR(msc->pcc_chan);
>> +				break;
>> +			}
> I don't see pcc support added in this series. Should we fail the probe
> if this interface is specified?

I've got patches from Andre P to support it on DT - but the platforms that need it keeping
popping in and out of existence. I'll pull these bits out - they were intended to check
the ACPI table wasn't totally rotten...


> (If keeping, there is a missing pcc_mbox_free_channel() on the error path.)

When pcc_mbox_request_channel() fails? It already called mbox_free_channel() itself.


>> +		}
>> +
>> +		list_add_rcu(&msc->glbl_list, &mpam_all_msc);
>> +		platform_set_drvdata(pdev, msc);
>> +	} while (0);
>> +	mutex_unlock(&mpam_list_lock);
>> +
>> +	if (!err) {
>> +		/* Create RIS entries described by firmware */
>> +		if (!acpi_disabled)
>> +			err = acpi_mpam_parse_resources(msc, plat_data);
>> +		else
>> +			err = mpam_dt_parse_resources(msc, plat_data);
>> +	}
>> +
>> +	if (!err && fw_num_msc == mpam_num_msc)
>> +		mpam_discovery_complete();
>> +
>> +	if (err && msc)
>> +		mpam_msc_drv_remove(pdev);
>> +
>> +	return err;
>> +}

>> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
>> new file mode 100644
>> index 000000000000..07e0f240eaca
>> --- /dev/null
>> +++ b/drivers/resctrl/mpam_internal.h
>> @@ -0,0 +1,62 @@

>> +struct mpam_msc {
>> +	/* member of mpam_all_msc */
>> +	struct list_head        glbl_list;
>> +
>> +	int			id;
>> +	struct platform_device *pdev;
>> +
>> +	/* Not modified after mpam_is_enabled() becomes true */
>> +	enum mpam_msc_iface	iface;
>> +	u32			pcc_subspace_id;
>> +	struct mbox_client	pcc_cl;
>> +	struct pcc_mbox_chan	*pcc_chan;
>> +	u32			nrdy_usec;
>> +	cpumask_t		accessibility;
>> +
>> +	/*
>> +	 * probe_lock is only take during discovery. After discovery these
> nit: s/take/taken/

Fixed,

>> +	 * properties become read-only and the lists are protected by SRCU.
>> +	 */
>> +	struct mutex		probe_lock;
>> +	unsigned long		ris_idxs[128 / BITS_PER_LONG];
>> +	u32			ris_max;
>> +
>> +	/* mpam_msc_ris of this component */
>> +	struct list_head	ris;
>> +
>> +	/*
>> +	 * part_sel_lock protects access to the MSC hardware registers that are
>> +	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
>> +	 * by RIS).
>> +	 * If needed, take msc->lock first.
>> +	 */
>> +	struct mutex		part_sel_lock;
>> +
>> +	/*
>> +	 * mon_sel_lock protects access to the MSC hardware registers that are
>> +	 * affeted by MPAMCFG_MON_SEL.
> nit: s/affeted/affected/
Fixed,


>> +	 * If needed, take msc->lock first.
>> +	 */
>> +	struct mutex		outer_mon_sel_lock;
>> +	raw_spinlock_t		inner_mon_sel_lock;
>> +	unsigned long		inner_mon_sel_flags;
>> +
>> +	void __iomem		*mapped_hwpage;
>> +	size_t			mapped_hwpage_sz;
>> +};
>> +
>> +#endif /* MPAM_INTERNAL_H */

Thanks,

James

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Ben Horgan 5 months ago

Hi James,

On 9/5/25 19:48, James Morse wrote:
> Hi Ben,
> 
> On 27/08/2025 14:03, Ben Horgan wrote:
>> On 8/22/25 16:29, James Morse wrote:
>>> Probing MPAM is convoluted. MSCs that are integrated with a CPU may
>>> only be accessible from those CPUs, and they may not be online.
>>> Touching the hardware early is pointless as MPAM can't be used until
>>> the system-wide common values for num_partid and num_pmg have been
>>> discovered.
>>>
>>> Start with driver probe/remove and mapping the MSC.
> 
>>> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
>>> new file mode 100644
>>> index 000000000000..a0d9a699a6e7
>>> --- /dev/null
>>> +++ b/drivers/resctrl/mpam_devices.c
>>> @@ -0,0 +1,336 @@
> 
>>> +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np,
>>> +				  u32 ris_idx)
>>> +{
>>> +	int err = 0;
>>> +	u32 level = 0;
>>> +	unsigned long cache_id;
>>> +	struct device_node *cache;
>>> +
>>> +	do {
>>> +		if (of_device_is_compatible(np, "arm,mpam-cache")) {
>>> +			cache = of_parse_phandle(np, "arm,mpam-device", 0);
>>> +			if (!cache) {
>>> +				pr_err("Failed to read phandle\n");
>>> +				break;
>>> +			}
>> This looks like this allows "arm,mpam-cache" and "arm,mpam-device" to be
>> used on an msc node when there are no ris children. This usage could be
>> reasonable but doesn't match the schema in the previous patch. Should
>> this usage be rejected or the schema extended?
> 
> The DT/ACPI stuff is only going to describe the things that make sense at a high level,
> e.g. the controls for the L3. There may be other controls for stuff that doesn't make
> sense in the hardware - these get discovered, grouped as 'unknown' and left alone.
> 
> Another angle on this is where there is an MSC that the OS will never make use of, but
> needs to know about to find the system wide minimum value. (there is a comment about
> this in the ACPI spec...)
> 
> I don't think its a problem if the magic dt-binding machinery is overly restrictive, that
> is about validating DTB files...

I agree with your points. However, I was rather thinking that the code
allows more ways to describe the same thing than the schema does. In
that, you could write something like:


msc@80000 {
        compatible = "foo,a-standalone-msc";
        reg = <0x80000 0x1000>;

	...
        msc@10000 {
            compatible = "arm,mpam-msc arm,mpam-cache";
            arm,mpam-device = <&mem>;
            ...
         }
}

Although, now I've written this out, it doesn't seem sensible to worry
about this. Using ris compatibles on an msc, as in my example, is
clearly an error.

> 
>
[snip]
>>> +		} else if (msc->iface == MPAM_IFACE_PCC) {
>>> +			msc->pcc_cl.dev = &pdev->dev;
>>> +			msc->pcc_cl.rx_callback = mpam_pcc_rx_callback;
>>> +			msc->pcc_cl.tx_block = false;
>>> +			msc->pcc_cl.tx_tout = 1000; /* 1s */
>>> +			msc->pcc_cl.knows_txdone = false;
>>> +
>>> +			msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl,
>>> +								 msc->pcc_subspace_id);
>>> +			if (IS_ERR(msc->pcc_chan)) {
>>> +				pr_err("Failed to request MSC PCC channel\n");
>>> +				err = PTR_ERR(msc->pcc_chan);
>>> +				break;
>>> +			}
>> I don't see pcc support added in this series. Should we fail the probe
>> if this interface is specified?
> 
> I've got patches from Andre P to support it on DT - but the platforms that need it keeping
> popping in and out of existence. I'll pull these bits out - they were intended to check
> the ACPI table wasn't totally rotten...
> 
> 
>> (If keeping, there is a missing pcc_mbox_free_channel() on the error path.)
> 
> When pcc_mbox_request_channel() fails? It already called mbox_free_channel() itself.
Apologies, this was relating to if the *_parse_resources() call below
failed.

> 
> 
>>> +		}
>>> +
>>> +		list_add_rcu(&msc->glbl_list, &mpam_all_msc);
>>> +		platform_set_drvdata(pdev, msc);
>>> +	} while (0);
>>> +	mutex_unlock(&mpam_list_lock);
>>> +
>>> +	if (!err) {
>>> +		/* Create RIS entries described by firmware */
>>> +		if (!acpi_disabled)
>>> +			err = acpi_mpam_parse_resources(msc, plat_data);
>>> +		else
>>> +			err = mpam_dt_parse_resources(msc, plat_data);
>>> +	}
>>> +
>>> +	if (!err && fw_num_msc == mpam_num_msc)
>>> +		mpam_discovery_complete();
>>> +
>>> +	if (err && msc)
>>> +		mpam_msc_drv_remove(pdev);
>>> +
>>> +	return err;
>>> +}
[snip]>
> Thanks,
> 
> James
Thanks,

Ben

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Markus Elfring 5 months, 2 weeks ago

…
…
> +static int mpam_msc_drv_probe(struct platform_device *pdev)
> +{
…
> +	} while (0);
> +	mutex_unlock(&mpam_list_lock);
> +
> +	if (!err) {
> +		/* Create RIS entries described by firmware */
> +		if (!acpi_disabled)
> +			err = acpi_mpam_parse_resources(msc, plat_data);
> +		else
> +			err = mpam_dt_parse_resources(msc, plat_data);
> +	}
> +
> +	if (!err && fw_num_msc == mpam_num_msc)
> +		mpam_discovery_complete();
> +
> +	if (err && msc)
> +		mpam_msc_drv_remove(pdev);
> +
> +	return err;
> +}
…

* Would you like to integrate anything from the following source code variant?

	if (!err)
		/* Create RIS entries described by firmware */
		err = acpi_disabled
		      ? mpam_dt_parse_resources(msc, plat_data)
		      : acpi_mpam_parse_resources(msc, plat_data);

	if (err) {
		if (msc)
			mpam_msc_drv_remove(pdev);
	} else {
		if (fw_num_msc == mpam_num_msc)
			mpam_discovery_complete();
	}

* How do you think about to increase the application of scope-based resource management
  at further places?


Regards,
Markus

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Greg Kroah-Hartman 5 months, 2 weeks ago

On Fri, Aug 22, 2025 at 09:55:33PM +0200, Markus Elfring wrote:
> …
> …
> > +static int mpam_msc_drv_probe(struct platform_device *pdev)
> > +{
> …
> > +	} while (0);
> > +	mutex_unlock(&mpam_list_lock);
> > +
> > +	if (!err) {
> > +		/* Create RIS entries described by firmware */
> > +		if (!acpi_disabled)
> > +			err = acpi_mpam_parse_resources(msc, plat_data);
> > +		else
> > +			err = mpam_dt_parse_resources(msc, plat_data);
> > +	}
> > +
> > +	if (!err && fw_num_msc == mpam_num_msc)
> > +		mpam_discovery_complete();
> > +
> > +	if (err && msc)
> > +		mpam_msc_drv_remove(pdev);
> > +
> > +	return err;
> > +}
> …
> 
> * Would you like to integrate anything from the following source code variant?
> 
> 	if (!err)
> 		/* Create RIS entries described by firmware */
> 		err = acpi_disabled
> 		      ? mpam_dt_parse_resources(msc, plat_data)
> 		      : acpi_mpam_parse_resources(msc, plat_data);
> 
> 	if (err) {
> 		if (msc)
> 			mpam_msc_drv_remove(pdev);
> 	} else {
> 		if (fw_num_msc == mpam_num_msc)
> 			mpam_discovery_complete();
> 	}
> 
> * How do you think about to increase the application of scope-based resource management
>   at further places?
> 
> 
> Regards,
> Markus

Hi,

This is the semi-friendly patch-bot of Greg Kroah-Hartman.

Markus, you seem to have sent a nonsensical or otherwise pointless
review comment to a patch submission on a Linux kernel developer mailing
list.  I strongly suggest that you not do this anymore.  Please do not
bother developers who are actively working to produce patches and
features with comments that, in the end, are a waste of time.

Patch submitter, please ignore Markus's suggestion; you do not need to
follow it at all.  The person/bot/AI that sent it is being ignored by
almost all Linux kernel maintainers for having a persistent pattern of
behavior of producing distracting and pointless commentary, and
inability to adapt to feedback.  Please feel free to also ignore emails
from them.

thanks,

greg k-h's patch email bot

Re: [PATCH 10/33] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate

Posted by Markus Elfring 5 months, 2 weeks ago

…
> +++ b/drivers/resctrl/mpam_devices.c
> @@ -0,0 +1,336 @@
…
> +static void mpam_msc_drv_remove(struct platform_device *pdev)
> +{
…
> +	mutex_lock(&mpam_list_lock);
> +	mpam_num_msc--;
…
> +	devm_kfree(&pdev->dev, msc);
> +	mutex_unlock(&mpam_list_lock);
> +}
…

Under which circumstances would you become interested to apply a statement
like “guard(mutex)(&mpam_list_lock);”?
https://elixir.bootlin.com/linux/v6.17-rc2/source/include/linux/mutex.h#L228

Regards,
Markus