[v2] arm_mpam: Add basic mpam driver

[PATCH v2 10/29] arm_mpam: Add cpuhp callbacks to probe MSC hardware

Posted by James Morse 3 weeks, 1 day ago

Because an MSC can only be accessed from the CPUs in its cpu-affinity
set we need to be running on one of those CPUs to probe the MSC
hardware.

Do this work in the cpuhp callback. Probing the hardware only happens
before MPAM is enabled: as each CPU's online call is made, walk all the
MSCs and probe those we can reach that haven't already been probed.

This adds the low-level MSC register accessors.

Once all MSCs reported by the firmware have been probed from a CPU in
their respective cpu-affinity set, the probe-time cpuhp callbacks are
replaced.  The replacement callbacks will ultimately need to handle
save/restore of the runtime MSC state across power transitions, but for
now there is nothing to do in them: so do nothing.

The architecture's context switch code will be enabled by a static-key.
This can be set by mpam_enable(), but it must be done from process context,
not from a cpuhp callback, because both take the cpuhp lock.
Whenever a new MSC has been probed, the mpam_enable() work is scheduled
to test if all the MSCs have been probed. If probing fails, mpam_disable()
is scheduled to unregister the cpuhp callbacks and free memory.

CC: Lecopzer Chen <lecopzerc@nvidia.com>
Signed-off-by: James Morse <james.morse@arm.com>
---
Changes since v1:
 * Removed register bounds check. If the firmware tables are wrong the
   resulting translation fault should be enough to debug this.
 * Removed '&' in front of a function pointer.
 * Pulled mpam_disable() into this patch.
 * Disable mpam when probing fails to avoid extra work on broken platforms.
 * Added mpam_disable_reason as there are now two non-debug reasons for this
   to happen.
---
 drivers/resctrl/mpam_devices.c  | 173 +++++++++++++++++++++++++++++++-
 drivers/resctrl/mpam_internal.h |   5 +
 2 files changed, 177 insertions(+), 1 deletion(-)

diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index c7f4981b3545..c265376d936b 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -4,6 +4,7 @@
 #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
 
 #include <linux/acpi.h>
+#include <linux/atomic.h>
 #include <linux/arm_mpam.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpu.h>
@@ -19,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/workqueue.h>
 
 #include "mpam_internal.h"
 
@@ -38,6 +40,22 @@ struct srcu_struct mpam_srcu;
  */
 static atomic_t mpam_num_msc;
 
+static int mpam_cpuhp_state;
+static DEFINE_MUTEX(mpam_cpuhp_state_lock);
+
+/*
+ * mpam is enabled once all devices have been probed from CPU online callbacks,
+ * scheduled via this work_struct. If access to an MSC depends on a CPU that
+ * was not brought online at boot, this can happen surprisingly late.
+ */
+static DECLARE_WORK(mpam_enable_work, &mpam_enable);
+
+/*
+ * All mpam error interrupts indicate a software bug. On receipt, disable the
+ * driver.
+ */
+static DECLARE_WORK(mpam_broken_work, &mpam_disable);
+
 /*
  * An MSC is a physical container for controls and monitors, each identified by
  * their RIS index. These share a base-address, interrupts and some MMIO
@@ -77,6 +95,24 @@ LIST_HEAD(mpam_classes);
 /* List of all objects that can be free()d after synchronise_srcu() */
 static LLIST_HEAD(mpam_garbage);
 
+/* When mpam is disabled, the printed reason to aid debugging */
+static char *mpam_disable_reason;
+
+static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg)
+{
+	WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility));
+
+	return readl_relaxed(msc->mapped_hwpage + reg);
+}
+
+static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg)
+{
+	lockdep_assert_held_once(&msc->part_sel_lock);
+	return __mpam_read_reg(msc, reg);
+}
+
+#define mpam_read_partsel_reg(msc, reg)        _mpam_read_partsel_reg(msc, MPAMF_##reg)
+
 #define init_garbage(x)	init_llist_node(&(x)->garbage.llist)
 
 static struct mpam_vmsc *
@@ -434,6 +470,86 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
 	return err;
 }
 
+static int mpam_msc_hw_probe(struct mpam_msc *msc)
+{
+	u64 idr;
+	struct device *dev = &msc->pdev->dev;
+
+	lockdep_assert_held(&msc->probe_lock);
+
+	idr = __mpam_read_reg(msc, MPAMF_AIDR);
+	if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) {
+		dev_err_once(dev, "MSC does not match MPAM architecture v1.x\n");
+		return -EIO;
+	}
+
+	msc->probed = true;
+
+	return 0;
+}
+
+static int mpam_cpu_online(unsigned int cpu)
+{
+	return 0;
+}
+
+/* Before mpam is enabled, try to probe new MSC */
+static int mpam_discovery_cpu_online(unsigned int cpu)
+{
+	int err = 0;
+	struct mpam_msc *msc;
+	bool new_device_probed = false;
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		if (!cpumask_test_cpu(cpu, &msc->accessibility))
+			continue;
+
+		mutex_lock(&msc->probe_lock);
+		if (!msc->probed)
+			err = mpam_msc_hw_probe(msc);
+		mutex_unlock(&msc->probe_lock);
+
+		if (!err)
+			new_device_probed = true;
+		else
+			break;
+	}
+
+	if (new_device_probed && !err)
+		schedule_work(&mpam_enable_work);
+	if (err) {
+		mpam_disable_reason = "error during probing";
+		schedule_work(&mpam_broken_work);
+	}
+
+	return err;
+}
+
+static int mpam_cpu_offline(unsigned int cpu)
+{
+	return 0;
+}
+
+static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online),
+					  int (*offline)(unsigned int offline))
+{
+	mutex_lock(&mpam_cpuhp_state_lock);
+	if (mpam_cpuhp_state) {
+		cpuhp_remove_state(mpam_cpuhp_state);
+		mpam_cpuhp_state = 0;
+	}
+
+	mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mpam:online",
+					     online, offline);
+	if (mpam_cpuhp_state <= 0) {
+		pr_err("Failed to register cpuhp callbacks");
+		mpam_cpuhp_state = 0;
+	}
+	mutex_unlock(&mpam_cpuhp_state_lock);
+}
+
 /*
  * An MSC can control traffic from a set of CPUs, but may only be accessible
  * from a (hopefully wider) set of CPUs. The common reason for this is power
@@ -544,7 +660,7 @@ static int mpam_msc_drv_probe(struct platform_device *pdev)
 		mpam_msc_drv_remove(pdev);
 
 	if (!err && atomic_add_return(1, &mpam_num_msc) == fw_num_msc)
-		pr_info("Discovered all MSC\n");
+		mpam_register_cpuhp_callbacks(mpam_discovery_cpu_online, NULL);
 
 	return err;
 }
@@ -557,6 +673,61 @@ static struct platform_driver mpam_msc_driver = {
 	.remove = mpam_msc_drv_remove,
 };
 
+static void mpam_enable_once(void)
+{
+	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);
+
+	pr_info("MPAM enabled\n");
+}
+
+void mpam_disable(struct work_struct *ignored)
+{
+	struct mpam_msc *msc, *tmp;
+
+	mutex_lock(&mpam_cpuhp_state_lock);
+	if (mpam_cpuhp_state) {
+		cpuhp_remove_state(mpam_cpuhp_state);
+		mpam_cpuhp_state = 0;
+	}
+	mutex_unlock(&mpam_cpuhp_state_lock);
+
+	mutex_lock(&mpam_list_lock);
+	list_for_each_entry_safe(msc, tmp, &mpam_all_msc, all_msc_list)
+		mpam_msc_destroy(msc);
+	mutex_unlock(&mpam_list_lock);
+	mpam_free_garbage();
+
+	pr_err_once("MPAM disabled due to %s\n", mpam_disable_reason);
+}
+
+/*
+ * Enable mpam once all devices have been probed.
+ * Scheduled by mpam_discovery_cpu_online() once all devices have been created.
+ * Also scheduled when new devices are probed when new CPUs come online.
+ */
+void mpam_enable(struct work_struct *work)
+{
+	static atomic_t once;
+	struct mpam_msc *msc;
+	bool all_devices_probed = true;
+
+	/* Have we probed all the hw devices? */
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		mutex_lock(&msc->probe_lock);
+		if (!msc->probed)
+			all_devices_probed = false;
+		mutex_unlock(&msc->probe_lock);
+
+		if (!all_devices_probed)
+			break;
+	}
+
+	if (all_devices_probed && !atomic_fetch_inc(&once))
+		mpam_enable_once();
+}
+
 static int __init mpam_msc_driver_init(void)
 {
 	if (!system_supports_mpam())
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index 109f03df46c2..d4f3febc7a50 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -51,6 +51,7 @@ struct mpam_msc {
 	 * properties become read-only and the lists are protected by SRCU.
 	 */
 	struct mutex		probe_lock;
+	bool			probed;
 	unsigned long		ris_idxs;
 	u32			ris_max;
 
@@ -149,6 +150,10 @@ struct mpam_msc_ris {
 extern struct srcu_struct mpam_srcu;
 extern struct list_head mpam_classes;
 
+/* Scheduled work callback to enable mpam once all MSC have been probed */
+void mpam_enable(struct work_struct *work);
+void mpam_disable(struct work_struct *work);
+
 int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
 				   cpumask_t *affinity);
 
-- 
2.39.5

Re: [PATCH v2 10/29] arm_mpam: Add cpuhp callbacks to probe MSC hardware

Posted by Ben Horgan 2 weeks, 6 days ago

Hi James,

On 9/10/25 21:42, James Morse wrote:
> Because an MSC can only by accessed from the CPUs in its cpu-affinity
> set we need to be running on one of those CPUs to probe the MSC
> hardware.
> 
> Do this work in the cpuhp callback. Probing the hardware will only
> happen before MPAM is enabled, walk all the MSCs and probe those we can
> reach that haven't already been probed as each CPU's online call is made.
> 
> This adds the low-level MSC register accessors.
> 
> Once all MSCs reported by the firmware have been probed from a CPU in
> their respective cpu-affinity set, the probe-time cpuhp callbacks are
> replaced.  The replacement callbacks will ultimately need to handle
> save/restore of the runtime MSC state across power transitions, but for
> now there is nothing to do in them: so do nothing.
> 
> The architecture's context switch code will be enabled by a static-key,
> this can be set by mpam_enable(), but must be done from process context,
> not a cpuhp callback because both take the cpuhp lock.
> Whenever a new MSC has been probed, the mpam_enable() work is scheduled
> to test if all the MSCs have been probed. If probing fails, mpam_disable()
> is scheduled to unregister the cpuhp callbacks and free memory.
> 
> CC: Lecopzer Chen <lecopzerc@nvidia.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> Changes since v1:
>  * Removed register bounds check. If the firmware tables are wrong the
>    resulting translation fault should be enough to debug this.
>  * Removed '&' in front of a function pointer.
>  * Pulled mpam_disable() into this patch.
>  * Disable mpam when probing fails to avoid extra work on broken platforms.
>  * Added mpam_disbale_reason as there are now two non-debug reasons for this
>    to happen.

Looks good to me.

Reviewed-by: Ben Horgan <ben.horgan@arm.com>

Thanks,

Ben

Re: [PATCH v2 10/29] arm_mpam: Add cpuhp callbacks to probe MSC hardware

Posted by James Morse 3 days, 3 hours ago

Hi Ben,

On 12/09/2025 11:42, Ben Horgan wrote:
> On 9/10/25 21:42, James Morse wrote:
>> Because an MSC can only by accessed from the CPUs in its cpu-affinity
>> set we need to be running on one of those CPUs to probe the MSC
>> hardware.
>>
>> Do this work in the cpuhp callback. Probing the hardware will only
>> happen before MPAM is enabled, walk all the MSCs and probe those we can
>> reach that haven't already been probed as each CPU's online call is made.
>>
>> This adds the low-level MSC register accessors.
>>
>> Once all MSCs reported by the firmware have been probed from a CPU in
>> their respective cpu-affinity set, the probe-time cpuhp callbacks are
>> replaced.  The replacement callbacks will ultimately need to handle
>> save/restore of the runtime MSC state across power transitions, but for
>> now there is nothing to do in them: so do nothing.
>>
>> The architecture's context switch code will be enabled by a static-key,
>> this can be set by mpam_enable(), but must be done from process context,
>> not a cpuhp callback because both take the cpuhp lock.
>> Whenever a new MSC has been probed, the mpam_enable() work is scheduled
>> to test if all the MSCs have been probed. If probing fails, mpam_disable()
>> is scheduled to unregister the cpuhp callbacks and free memory.
>>
>> CC: Lecopzer Chen <lecopzerc@nvidia.com>
>> Signed-off-by: James Morse <james.morse@arm.com>
>> ---
>> Changes since v1:
>>  * Removed register bounds check. If the firmware tables are wrong the
>>    resulting translation fault should be enough to debug this.
>>  * Removed '&' in front of a function pointer.
>>  * Pulled mpam_disable() into this patch.
>>  * Disable mpam when probing fails to avoid extra work on broken platforms.
>>  * Added mpam_disbale_reason as there are now two non-debug reasons for this
>>    to happen.
> 
> Looks good to me.
> 
> Reviewed-by: Ben Horgan <ben.horgan@arm.com>


Thanks!

James

Re: [PATCH v2 10/29] arm_mpam: Add cpuhp callbacks to probe MSC hardware

Posted by Jonathan Cameron 3 weeks ago

On Wed, 10 Sep 2025 20:42:50 +0000
James Morse <james.morse@arm.com> wrote:

> Because an MSC can only by accessed from the CPUs in its cpu-affinity
> set we need to be running on one of those CPUs to probe the MSC
> hardware.
> 
> Do this work in the cpuhp callback. Probing the hardware will only
> happen before MPAM is enabled, walk all the MSCs and probe those we can
> reach that haven't already been probed as each CPU's online call is made.
> 
> This adds the low-level MSC register accessors.
> 
> Once all MSCs reported by the firmware have been probed from a CPU in
> their respective cpu-affinity set, the probe-time cpuhp callbacks are
> replaced.  The replacement callbacks will ultimately need to handle
> save/restore of the runtime MSC state across power transitions, but for
> now there is nothing to do in them: so do nothing.
> 
> The architecture's context switch code will be enabled by a static-key,
> this can be set by mpam_enable(), but must be done from process context,
> not a cpuhp callback because both take the cpuhp lock.
> Whenever a new MSC has been probed, the mpam_enable() work is scheduled
> to test if all the MSCs have been probed. If probing fails, mpam_disable()
> is scheduled to unregister the cpuhp callbacks and free memory.
> 
> CC: Lecopzer Chen <lecopzerc@nvidia.com>
> Signed-off-by: James Morse <james.morse@arm.com>

Trivial suggestion inline. Either way
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>

> +
> +/* Before mpam is enabled, try to probe new MSC */
> +static int mpam_discovery_cpu_online(unsigned int cpu)
> +{
> +	int err = 0;
> +	struct mpam_msc *msc;
> +	bool new_device_probed = false;
> +
> +	guard(srcu)(&mpam_srcu);
> +	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
> +				 srcu_read_lock_held(&mpam_srcu)) {
> +		if (!cpumask_test_cpu(cpu, &msc->accessibility))
> +			continue;
> +
> +		mutex_lock(&msc->probe_lock);
> +		if (!msc->probed)
> +			err = mpam_msc_hw_probe(msc);
> +		mutex_unlock(&msc->probe_lock);
> +
> +		if (!err)
> +			new_device_probed = true;
> +		else
> +			break;
Unless this is going to get more complex, why not

		if (err)
			break;

		new_device_probed = true;
> +	}
> +
> +	if (new_device_probed && !err)
> +		schedule_work(&mpam_enable_work);
> +	if (err) {
> +		mpam_disable_reason = "error during probing";
> +		schedule_work(&mpam_broken_work);
> +	}
> +
> +	return err;
> +}

> +static void mpam_enable_once(void)
> +{
> +	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);
> +
> +	pr_info("MPAM enabled\n");

Feels too noisy given it should be easy enough to tell. pr_debug() perhaps.


> +}

Re: [PATCH v2 10/29] arm_mpam: Add cpuhp callbacks to probe MSC hardware

Posted by James Morse 3 days, 3 hours ago

Hi Jonathan,

On 11/09/2025 16:07, Jonathan Cameron wrote:
> On Wed, 10 Sep 2025 20:42:50 +0000
> James Morse <james.morse@arm.com> wrote:
> 
>> Because an MSC can only by accessed from the CPUs in its cpu-affinity
>> set we need to be running on one of those CPUs to probe the MSC
>> hardware.
>>
>> Do this work in the cpuhp callback. Probing the hardware will only
>> happen before MPAM is enabled, walk all the MSCs and probe those we can
>> reach that haven't already been probed as each CPU's online call is made.
>>
>> This adds the low-level MSC register accessors.
>>
>> Once all MSCs reported by the firmware have been probed from a CPU in
>> their respective cpu-affinity set, the probe-time cpuhp callbacks are
>> replaced.  The replacement callbacks will ultimately need to handle
>> save/restore of the runtime MSC state across power transitions, but for
>> now there is nothing to do in them: so do nothing.
>>
>> The architecture's context switch code will be enabled by a static-key,
>> this can be set by mpam_enable(), but must be done from process context,
>> not a cpuhp callback because both take the cpuhp lock.
>> Whenever a new MSC has been probed, the mpam_enable() work is scheduled
>> to test if all the MSCs have been probed. If probing fails, mpam_disable()
>> is scheduled to unregister the cpuhp callbacks and free memory.

> Trivial suggestion inline. Either way
> Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>

Thanks!

>> +/* Before mpam is enabled, try to probe new MSC */
>> +static int mpam_discovery_cpu_online(unsigned int cpu)
>> +{
>> +	int err = 0;
>> +	struct mpam_msc *msc;
>> +	bool new_device_probed = false;
>> +
>> +	guard(srcu)(&mpam_srcu);
>> +	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
>> +				 srcu_read_lock_held(&mpam_srcu)) {
>> +		if (!cpumask_test_cpu(cpu, &msc->accessibility))
>> +			continue;
>> +
>> +		mutex_lock(&msc->probe_lock);
>> +		if (!msc->probed)
>> +			err = mpam_msc_hw_probe(msc);
>> +		mutex_unlock(&msc->probe_lock);
>> +
>> +		if (!err)
>> +			new_device_probed = true;
>> +		else
>> +			break;

> Unless this going to get more complex why not
> 
> 		if (err)
> 			break;
> 
> 		new_device_probed = true;

Sure - it's been both simpler and more complex in the past!

>> +	}
>> +
>> +	if (new_device_probed && !err)
>> +		schedule_work(&mpam_enable_work);
>> +	if (err) {
>> +		mpam_disable_reason = "error during probing";
>> +		schedule_work(&mpam_broken_work);
>> +	}
>> +
>> +	return err;
>> +}
> 
>> +static void mpam_enable_once(void)
>> +{
>> +	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);
>> +
>> +	pr_info("MPAM enabled\n");

> Feels too noisy given it should be easy enough to tell. pr_dbg() perhaps.

I was aiming for the driver to only print one thing - once all the hardware has been
probed. Once the driver is assembled, this prints the number of PARTID/PMG that were
discovered as the system wide limits.

The reason to print something is that if you see this message, but don't have resctrl
appear in /proc/filesystems - it's never going to appear because the resctrl glue code
couldn't find anything it could use. As this isn't an error, nothing gets printed in
this case.
This is the most common complaint I get - "our platform doesn't look like a Xeon - why
doesn't resctrl work with it?"

It also matters for other requesters, like the SMMU. If they probe after this point, they
can't reduce the PARTID/PMG range - and may get an error and have their MPAM abilities
disabled. Having an entry in the boot log makes this easier to debug.

The alternative would be to keep track of what the driver is up to, and expose that
through debugfs - but information that only exists for debug purposes is likely to be
wrong. It also doesn't help work out what order different drivers tried to probe in.

Thanks,

James