Because an MSC can only be accessed from the CPUs in its cpu-affinity
set, we need to be running on one of those CPUs to probe the MSC
hardware.
Do this work in the cpuhp callback. Probing the hardware will only
happen before MPAM is enabled: walk all the MSCs and probe those we can
reach that haven't already been probed.
Later, once MPAM is enabled, this cpuhp callback will be replaced by
one that avoids the global list.
Enabling a static key will also take the cpuhp lock, so this can't be done
from the cpuhp callback. Whenever a new MSC has been probed, schedule
work to test whether all the MSCs have now been probed.
CC: Lecopzer Chen <lecopzerc@nvidia.com>
Signed-off-by: James Morse <james.morse@arm.com>
---
 drivers/resctrl/mpam_devices.c  | 144 +++++++++++++++++++++++++++++++-
 drivers/resctrl/mpam_internal.h |   8 +-
 2 files changed, 147 insertions(+), 5 deletions(-)

diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index 5baf2a8786fb..9d6516f98acf 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -4,6 +4,7 @@
 #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
 
 #include <linux/acpi.h>
+#include <linux/atomic.h>
 #include <linux/arm_mpam.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpu.h>
@@ -21,6 +22,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/workqueue.h>
 
 #include <acpi/pcc.h>
 
@@ -39,6 +41,16 @@ struct srcu_struct mpam_srcu;
 /* MPAM isn't available until all the MSC have been probed. */
 static u32 mpam_num_msc;
 
+static int mpam_cpuhp_state;
+static DEFINE_MUTEX(mpam_cpuhp_state_lock);
+
+/*
+ * mpam is enabled once all devices have been probed from CPU online callbacks,
+ * scheduled via this work_struct. If access to an MSC depends on a CPU that
+ * was not brought online at boot, this can happen surprisingly late.
+ */
+static DECLARE_WORK(mpam_enable_work, &mpam_enable);
+
 /*
  * An MSC is a physical container for controls and monitors, each identified by
  * their RIS index. These share a base-address, interrupts and some MMIO
@@ -78,6 +90,22 @@ LIST_HEAD(mpam_classes);
 /* List of all objects that can be free()d after synchronise_srcu() */
 static LLIST_HEAD(mpam_garbage);
 
+static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg)
+{
+	WARN_ON_ONCE(reg > msc->mapped_hwpage_sz);
+	WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility));
+
+	return readl_relaxed(msc->mapped_hwpage + reg);
+}
+
+static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg)
+{
+	lockdep_assert_held_once(&msc->part_sel_lock);
+	return __mpam_read_reg(msc, reg);
+}
+
+#define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg)
+
 #define init_garbage(x)	init_llist_node(&(x)->garbage.llist)
 
 static struct mpam_vmsc *
@@ -511,9 +539,84 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
 	return err;
 }
 
-static void mpam_discovery_complete(void)
+static int mpam_msc_hw_probe(struct mpam_msc *msc)
+{
+	u64 idr;
+	int err;
+
+	lockdep_assert_held(&msc->probe_lock);
+
+	mutex_lock(&msc->part_sel_lock);
+	idr = mpam_read_partsel_reg(msc, AIDR);
+	if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) {
+		pr_err_once("%s does not match MPAM architecture v1.x\n",
+			    dev_name(&msc->pdev->dev));
+		err = -EIO;
+	} else {
+		msc->probed = true;
+		err = 0;
+	}
+	mutex_unlock(&msc->part_sel_lock);
+
+	return err;
+}
+
+static int mpam_cpu_online(unsigned int cpu)
 {
-	pr_err("Discovered all MSC\n");
+	return 0;
+}
+
+/* Before mpam is enabled, try to probe new MSC */
+static int mpam_discovery_cpu_online(unsigned int cpu)
+{
+	int err = 0;
+	struct mpam_msc *msc;
+	bool new_device_probed = false;
+
+	mutex_lock(&mpam_list_lock);
+	list_for_each_entry(msc, &mpam_all_msc, glbl_list) {
+		if (!cpumask_test_cpu(cpu, &msc->accessibility))
+			continue;
+
+		mutex_lock(&msc->probe_lock);
+		if (!msc->probed)
+			err = mpam_msc_hw_probe(msc);
+		mutex_unlock(&msc->probe_lock);
+
+		if (!err)
+			new_device_probed = true;
+		else
+			break; // mpam_broken
+	}
+	mutex_unlock(&mpam_list_lock);
+
+	if (new_device_probed && !err)
+		schedule_work(&mpam_enable_work);
+
+	return err;
+}
+
+static int mpam_cpu_offline(unsigned int cpu)
+{
+	return 0;
+}
+
+static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online),
+					  int (*offline)(unsigned int offline))
+{
+	mutex_lock(&mpam_cpuhp_state_lock);
+	if (mpam_cpuhp_state) {
+		cpuhp_remove_state(mpam_cpuhp_state);
+		mpam_cpuhp_state = 0;
+	}
+
+	mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mpam:online",
+					     online, offline);
+	if (mpam_cpuhp_state <= 0) {
+		pr_err("Failed to register cpuhp callbacks");
+		mpam_cpuhp_state = 0;
+	}
+	mutex_unlock(&mpam_cpuhp_state_lock);
 }
 
 static int mpam_dt_count_msc(void)
@@ -772,7 +875,7 @@ static int mpam_msc_drv_probe(struct platform_device *pdev)
 	}
 
 	if (!err && fw_num_msc == mpam_num_msc)
-		mpam_discovery_complete();
+		mpam_register_cpuhp_callbacks(&mpam_discovery_cpu_online, NULL);
 
 	if (err && msc)
 		mpam_msc_drv_remove(pdev);
@@ -795,6 +898,41 @@ static struct platform_driver mpam_msc_driver = {
 	.remove = mpam_msc_drv_remove,
 };
 
+static void mpam_enable_once(void)
+{
+	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);
+
+	pr_info("MPAM enabled\n");
+}
+
+/*
+ * Enable mpam once all devices have been probed.
+ * Scheduled by mpam_discovery_cpu_online() once all devices have been created.
+ * Also scheduled when new devices are probed when new CPUs come online.
+ */
+void mpam_enable(struct work_struct *work)
+{
+	static atomic_t once;
+	struct mpam_msc *msc;
+	bool all_devices_probed = true;
+
+	/* Have we probed all the hw devices? */
+	mutex_lock(&mpam_list_lock);
+	list_for_each_entry(msc, &mpam_all_msc, glbl_list) {
+		mutex_lock(&msc->probe_lock);
+		if (!msc->probed)
+			all_devices_probed = false;
+		mutex_unlock(&msc->probe_lock);
+
+		if (!all_devices_probed)
+			break;
+	}
+	mutex_unlock(&mpam_list_lock);
+
+	if (all_devices_probed && !atomic_fetch_inc(&once))
+		mpam_enable_once();
+}
+
 /*
  * MSC that are hidden under caches are not created as platform devices
  * as there is no cache driver. Caches are also special-cased in
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index 6e0982a1a9ac..a98cca08a2ef 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -49,6 +49,7 @@ struct mpam_msc {
 	 * properties become read-only and the lists are protected by SRCU.
 	 */
 	struct mutex		probe_lock;
+	bool			probed;
 	unsigned long		ris_idxs[128 / BITS_PER_LONG];
 	u32			ris_max;
 
@@ -59,14 +60,14 @@ struct mpam_msc {
 	 * part_sel_lock protects access to the MSC hardware registers that are
 	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
 	 * by RIS).
-	 * If needed, take msc->lock first.
+	 * If needed, take msc->probe_lock first.
 	 */
 	struct mutex		part_sel_lock;
 
 	/*
 	 * mon_sel_lock protects access to the MSC hardware registers that are
 	 * affeted by MPAMCFG_MON_SEL.
-	 * If needed, take msc->lock first.
+	 * If needed, take msc->probe_lock first.
 	 */
 	struct mutex		outer_mon_sel_lock;
 	raw_spinlock_t		inner_mon_sel_lock;
@@ -147,6 +148,9 @@ struct mpam_msc_ris {
 extern struct srcu_struct mpam_srcu;
 extern struct list_head mpam_classes;
 
+/* Scheduled work callback to enable mpam once all MSC have been probed */
+void mpam_enable(struct work_struct *work);
+
 int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
 				   cpumask_t *affinity);
 
--
2.20.1
Hi James,

While I'm here:

On Fri, Aug 22, 2025 at 03:29:55PM +0000, James Morse wrote:
> Because an MSC can only by accessed from the CPUs in its cpu-affinity
> set we need to be running on one of those CPUs to probe the MSC
> hardware.
>
> Do this work in the cpuhp callback. Probing the hardware will only
> happen before MPAM is enabled, walk all the MSCs and probe those we can
> reach that haven't already been probed.
>
> Later once MPAM is enabled, this cpuhp callback will be replaced by
> one that avoids the global list.
>
> Enabling a static key will also take the cpuhp lock, so can't be done
> from the cpuhp callback. Whenever a new MSC has been probed schedule
> work to test if all the MSCs have now been probed.
>
> CC: Lecopzer Chen <lecopzerc@nvidia.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
>  drivers/resctrl/mpam_devices.c  | 144 +++++++++++++++++++++++++++++++-
>  drivers/resctrl/mpam_internal.h |   8 +-
>  2 files changed, 147 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
> index 5baf2a8786fb..9d6516f98acf 100644
> --- a/drivers/resctrl/mpam_devices.c
> +++ b/drivers/resctrl/mpam_devices.c

[...]

> @@ -511,9 +539,84 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
>  	return err;
>  }
>
> -static void mpam_discovery_complete(void)
> +static int mpam_msc_hw_probe(struct mpam_msc *msc)
> +{
> +	u64 idr;
> +	int err;

Redundant variable which gets removed again in the next patch?

> +
> +	lockdep_assert_held(&msc->probe_lock);
> +
> +	mutex_lock(&msc->part_sel_lock);
> +	idr = mpam_read_partsel_reg(msc, AIDR);
> +	if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) {
> +		pr_err_once("%s does not match MPAM architecture v1.x\n",
> +			    dev_name(&msc->pdev->dev));
> +		err = -EIO;
> +	} else {
> +		msc->probed = true;
> +		err = 0;
> +	}
> +	mutex_unlock(&msc->part_sel_lock);
> +
> +	return err;
> +}

[...]

Cheers
---Dave
Hi James,

On Fri, Aug 22, 2025 at 03:29:55PM +0000, James Morse wrote:
> Because an MSC can only by accessed from the CPUs in its cpu-affinity
> set we need to be running on one of those CPUs to probe the MSC
> hardware.
>
> Do this work in the cpuhp callback. Probing the hardware will only
> happen before MPAM is enabled, walk all the MSCs and probe those we can
> reach that haven't already been probed.

It may be worth mentioning that the low-level MSC register accessors
are added by this patch.

> Later once MPAM is enabled, this cpuhp callback will be replaced by
> one that avoids the global list.

I misread this as meaning "later in the patch series" and got
confused.

Perhaps, something like the following? (though this got a bit verbose)

--8<--

Once all MSCs reported by the firmware have been probed from a CPU in
their respective cpu-affinity set, the probe-time cpuhp callbacks are
replaced. The replacement callbacks will ultimately need to handle
save/restore of the runtime MSC state across power transitions, but for
now there is nothing to do in them: so do nothing.

-->8--

> Enabling a static key will also take the cpuhp lock, so can't be done

What static key? None in this patch that I can see.

> from the cpuhp callback. Whenever a new MSC has been probed schedule
> work to test if all the MSCs have now been probed.
>
> CC: Lecopzer Chen <lecopzerc@nvidia.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
>  drivers/resctrl/mpam_devices.c  | 144 +++++++++++++++++++++++++++++++-
>  drivers/resctrl/mpam_internal.h |   8 +-
>  2 files changed, 147 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/resctrl/mpam_devices.c

[...]

> @@ -511,9 +539,84 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,

[...]

> +static int mpam_cpu_online(unsigned int cpu)
>  {
> -	pr_err("Discovered all MSC\n");

I guess this disappears later?

If we print anything, it feels like it should be in the
mpam_enable_once() path, otherwise it looks like dmesg is going to get
spammed on every hotplug. I might have missed something, here.

> +	return 0;
> +}
> +
> +/* Before mpam is enabled, try to probe new MSC */
> +static int mpam_discovery_cpu_online(unsigned int cpu)
> +{
> +	int err = 0;
> +	struct mpam_msc *msc;
> +	bool new_device_probed = false;
> +
> +	mutex_lock(&mpam_list_lock);

I take it nothing breaks if we sleep here?

Pending cpuhp callbacks for this CPU look to be blocked while we sleep,
at the very least.

Since this only happens during the probing phase, maybe that's not such
a big deal.

Is it likely that some late CPUs might be left offline indefinitely?
If so, we might end up doing futile work here forever.

> +	list_for_each_entry(msc, &mpam_all_msc, glbl_list) {
> +		if (!cpumask_test_cpu(cpu, &msc->accessibility))
> +			continue;
> +
> +		mutex_lock(&msc->probe_lock);
> +		if (!msc->probed)
> +			err = mpam_msc_hw_probe(msc);
> +		mutex_unlock(&msc->probe_lock);
> +
> +		if (!err)
> +			new_device_probed = true;
> +		else
> +			break; // mpam_broken

What's the effect of returning a non-zero value to the CPU hotplug
callback dispatcher here?

Do we want to tear anything down if MPAM is unusable?

> +	}
> +	mutex_unlock(&mpam_list_lock);
> +
> +	if (new_device_probed && !err)
> +		schedule_work(&mpam_enable_work);
> +
> +	return err;
> +}
> +
> +static int mpam_cpu_offline(unsigned int cpu)
> +{
> +	return 0;
> +}
> +
> +static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online),
> +					  int (*offline)(unsigned int offline))
> +{
> +	mutex_lock(&mpam_cpuhp_state_lock);
> +	if (mpam_cpuhp_state) {
> +		cpuhp_remove_state(mpam_cpuhp_state);
> +		mpam_cpuhp_state = 0;
> +	}
> +
> +	mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mpam:online",
> +					     online, offline);
> +	if (mpam_cpuhp_state <= 0) {
> +		pr_err("Failed to register cpuhp callbacks");

Should an error code be returned to the caller if this fails?

> +		mpam_cpuhp_state = 0;
> +	}
> +	mutex_unlock(&mpam_cpuhp_state_lock);
>  }
>
>  static int mpam_dt_count_msc(void)
> @@ -772,7 +875,7 @@ static int mpam_msc_drv_probe(struct platform_device *pdev)
>  	}
>
>  	if (!err && fw_num_msc == mpam_num_msc)
> -		mpam_discovery_complete();
> +		mpam_register_cpuhp_callbacks(&mpam_discovery_cpu_online, NULL);

Abandon probing the MSC if this fails?

(However, the next phase of probing hangs off CPU hotplug, so it just
won't happen if the callbacks can't be registered -- but it looks like
MPAM may be left in a half-probed state. I'm not entirely convinced
that this matters if the MPAM driver is not unloadable anyway...)

Nit: redundant &

(You don't have it in the similar call in mpam_enable_once().)

>
>  	if (err && msc)
>  		mpam_msc_drv_remove(pdev);
> @@ -795,6 +898,41 @@ static struct platform_driver mpam_msc_driver = {
>  	.remove = mpam_msc_drv_remove,
>  };
>
> +static void mpam_enable_once(void)
> +{
> +	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);

Should it be fatal if this fails?

> +
> +	pr_info("MPAM enabled\n");
> +}
> +
> +/*
> + * Enable mpam once all devices have been probed.
> + * Scheduled by mpam_discovery_cpu_online() once all devices have been created.
> + * Also scheduled when new devices are probed when new CPUs come online.
> + */
> +void mpam_enable(struct work_struct *work)
> +{
> +	static atomic_t once;

Nit: possibly unnecessary atomic_t? This is slow-path code, and we
already have to take mpam_list_lock. Harmless, though.

> +	struct mpam_msc *msc;
> +	bool all_devices_probed = true;
> +
> +	/* Have we probed all the hw devices? */
> +	mutex_lock(&mpam_list_lock);
> +	list_for_each_entry(msc, &mpam_all_msc, glbl_list) {
> +		mutex_lock(&msc->probe_lock);
> +		if (!msc->probed)
> +			all_devices_probed = false;
> +		mutex_unlock(&msc->probe_lock);
> +
> +		if (!all_devices_probed)
> +			break;

WARN()?

We counted the MSCs in via the mpam_discovery_cpu_online(), so I think
we shouldn't get in here if some failed to probe?

> +	}
> +	mutex_unlock(&mpam_list_lock);
> +
> +	if (all_devices_probed && !atomic_fetch_inc(&once))
> +		mpam_enable_once();
> +}

[...]

Cheers
---Dave
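(A minimal sketch of the cpuhp_setup_state() contract that the questions
above turn on - illustrative only, not code from the patch, and the
example names are invented. With CPUHP_AP_ONLINE_DYN the function hands
back a positive dynamic state number on success, and it invokes the
online callback synchronously for every CPU that is already online, so
an error from a callback propagates back to the registration site:)

#include <linux/cpuhotplug.h>

/* Invented example: a non-zero return from here unwinds the registration. */
static int example_online(unsigned int cpu)
{
	return 0;
}

static int example_register(void)
{
	int state;

	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
				  example_online, NULL);
	if (state < 0)
		return state;	/* a callback failed, or no dynamic slot free */

	/* state > 0: keep it for a later cpuhp_remove_state(state) */
	return 0;
}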
Hi Dave,

On 05/09/2025 17:40, Dave Martin wrote:
> On Fri, Aug 22, 2025 at 03:29:55PM +0000, James Morse wrote:
>> Because an MSC can only by accessed from the CPUs in its cpu-affinity
>> set we need to be running on one of those CPUs to probe the MSC
>> hardware.
>>
>> Do this work in the cpuhp callback. Probing the hardware will only
>> happen before MPAM is enabled, walk all the MSCs and probe those we can
>> reach that haven't already been probed.
>
> It may be worth mentioning that the low-level MSC register accessors
> are added by this patch.

Sure,

>> Later once MPAM is enabled, this cpuhp callback will be replaced by
>> one that avoids the global list.
>
> I misread this as meaning "later in the patch series" and got
> confused.
>
> Perhaps, something like the following? (though this got a bit verbose)
>
> --8<--
>
> Once all MSCs reported by the firmware have been probed from a CPU in
> their respective cpu-affinity set, the probe-time cpuhp callbacks are
> replaced. The replacement callbacks will ultimately need to handle
> save/restore of the runtime MSC state across power transitions, but for
> now there is nothing to do in them: so do nothing.
>
> -->8--

Done.

>> Enabling a static key will also take the cpuhp lock, so can't be done
>
> What static key?

The one that enables the architecture's context-switch code. That was
added by an earlier patch, but got moved later to reduce the number of
trees that this series touches. (also, there is no point having the
context switch code if you can't have different values until the resctrl
code shows up.)

This is trying to describe why mpam_enable() is scheduled, instead of
just being called in cpuhp context. (again - to reduce the churn caused
by changing that later).

I'll rephrase it as:
| The architecture's context switch code will be enabled by a static-key, this can be set
| by mpam_enable(), but must be done from process context, not a cpuhp callback because
| both take the cpuhp lock.
| Whenever a new MSC has been probed, the mpam_enable() work is scheduled to test if all
| the MSCs have been probed.

> None in this patch that I can see.
>
>> from the cpuhp callback. Whenever a new MSC has been probed schedule
>> work to test if all the MSCs have now been probed.

>> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
>
> [...]
>
>> @@ -511,9 +539,84 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
>
> [...]
>
>> +static int mpam_cpu_online(unsigned int cpu)
>>  {
>> -	pr_err("Discovered all MSC\n");
>
> I guess this disappears later?
>
> If we print anything, it feels like it should be in the
> mpam_enable_once() path, otherwise it looks like dmesg is going to get
> spammed on every hotplug. I might have missed something, here.

Yes - there is a print that happens on the mpam_enable_once() path that
shows the number of PARTID/PMG discovered. That "Discovered all MSC"
message was so that probing had this shape from the very beginning - it
is unfortunately not as simple as probing a stand-alone driver.

>> +	return 0;
>> +}
>> +
>> +/* Before mpam is enabled, try to probe new MSC */
>> +static int mpam_discovery_cpu_online(unsigned int cpu)
>> +{
>> +	int err = 0;
>> +	struct mpam_msc *msc;
>> +	bool new_device_probed = false;
>> +
>> +	mutex_lock(&mpam_list_lock);

> I take it nothing breaks if we sleep here?

From memory, callbacks registered at CPUHP_AP_ONLINE_DYN are allowed to
sleep. There is some point in the state machine where you can't. I can't
find where this comes from right now... e.g. resctrl does this in x86's
resctrl_arch_online_cpu() and friends.

> Pending cpuhp callbacks for this CPU look to be blocked while we sleep,
> at the very least.

> Since this only happens during the probing phase, maybe that's not such
> a big deal.

> Is it likely that some late CPUs might be left offline indefinitely?

Offlined and never come back is certainly something that can happen.

> If so, we might end up doing futile work here forever.

It may never probe all the MSC? Yes, that can certainly happen. But the
work only happens when CPUs come online, which is already a major
serialising event. There is no cost to run in this 'not done yet' state
forever. It's not retrying on a timer or something like that.

>> +	list_for_each_entry(msc, &mpam_all_msc, glbl_list) {
>> +		if (!cpumask_test_cpu(cpu, &msc->accessibility))
>> +			continue;
>> +
>> +		mutex_lock(&msc->probe_lock);
>> +		if (!msc->probed)
>> +			err = mpam_msc_hw_probe(msc);
>> +		mutex_unlock(&msc->probe_lock);
>> +
>> +		if (!err)
>> +			new_device_probed = true;
>> +		else
>> +			break; // mpam_broken

> What's the effect of returning a non-zero value to the CPU hotplug
> callback dispatcher here?

I think the dynamically allocated ones can fail without any ill effects,
it'll print a message but nothing else will happen. In this case the
callbacks are synchronous with the attempt to register them, so it
propagates the error back there.

> Do we want to tear anything down if MPAM is unusable?

It will keep trying, whereas it could pack up shop completely. Looks
like that '// mpam_broken' is where I intended to schedule the work, but
the code doesn't exist this early in the series. I'll pull bits of that
earlier.

>> +	}
>> +	mutex_unlock(&mpam_list_lock);
>> +
>> +	if (new_device_probed && !err)
>> +		schedule_work(&mpam_enable_work);
>> +
>> +	return err;
>> +}
>> +
>> +static int mpam_cpu_offline(unsigned int cpu)
>> +{
>> +	return 0;
>> +}
>> +
>> +static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online),
>> +					  int (*offline)(unsigned int offline))
>> +{
>> +	mutex_lock(&mpam_cpuhp_state_lock);
>> +	if (mpam_cpuhp_state) {
>> +		cpuhp_remove_state(mpam_cpuhp_state);
>> +		mpam_cpuhp_state = 0;
>> +	}
>> +
>> +	mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mpam:online",
>> +					     online, offline);
>> +	if (mpam_cpuhp_state <= 0) {
>> +		pr_err("Failed to register cpuhp callbacks");

> Should an error code be returned to the caller if this fails?

It can fail asynchronously - so any error handling for this only solves
part of the problem, a failure here means at least one of the callbacks
ran and returned an error. If we schedule the disable call in the
callback then that will take care of tearing the whole thing down.

The cpuhp callbacks don't get registered until the driver has found all
the MSC that firmware described, so there is no race with the driver
probing an MSC after the disable call got scheduled which tears the
whole thing down.

>> +		mpam_cpuhp_state = 0;
>> +	}
>> +	mutex_unlock(&mpam_cpuhp_state_lock);
>>  }
>>
>>  static int mpam_dt_count_msc(void)
>> @@ -772,7 +875,7 @@ static int mpam_msc_drv_probe(struct platform_device *pdev)
>>  	}
>>
>>  	if (!err && fw_num_msc == mpam_num_msc)
>> -		mpam_discovery_complete();
>> +		mpam_register_cpuhp_callbacks(&mpam_discovery_cpu_online, NULL);
>
> Abandon probing the MSC if this fails?

Any error returned here is most likely to be from
mpam_discovery_cpu_online(), but that can also happen asynchronously.
Scheduling mpam_disable() is a better approach as it covers the
asynchronous case too.

> (However, the next phase of probing hangs off CPU hotplug, so it just
> won't happen if the callbacks can't be registered -- but it looks like
> MPAM may be left in a half-probed state. I'm not entirely convinced
> that this matters if the MPAM driver is not unloadable anyway...)

I'm not at all worried about failing to register the cpuhp callbacks.
The space needed for that is pre-allocated, if there isn't enough space,
you get a splat from the cpuhp core - and the callbacks never run. No
additional/unnecessary work happens - sure the memory didn't get free'd,
but the WARN() from cpuhp_reserve_state() should be enough to debug this.

> Nit: redundant &
>
> (You don't have it in the similar call in mpam_enable_once().)

Done,

>>  	if (err && msc)
>>  		mpam_msc_drv_remove(pdev);
>> @@ -795,6 +898,41 @@ static struct platform_driver mpam_msc_driver = {
>>  	.remove = mpam_msc_drv_remove,
>>  };
>>
>> +static void mpam_enable_once(void)
>> +{
>> +	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);
>
> Should it be fatal if this fails?

As above. Once case doesn't matter - and any handling here is incomplete.

>> +
>> +	pr_info("MPAM enabled\n");
>> +}
>> +
>> +/*
>> + * Enable mpam once all devices have been probed.
>> + * Scheduled by mpam_discovery_cpu_online() once all devices have been created.
>> + * Also scheduled when new devices are probed when new CPUs come online.
>> + */
>> +void mpam_enable(struct work_struct *work)
>> +{
>> +	static atomic_t once;
>
> Nit: possibly unnecessary atomic_t? This is slow-path code, and we
> already have to take mpam_list_lock. Harmless, though.

mpam_enable_once() can't be called under the lock because of the
ordering with cpuhp. This just ended up being cleaner. Too much is
stuffed under that lock already!

>> +	struct mpam_msc *msc;
>> +	bool all_devices_probed = true;
>> +
>> +	/* Have we probed all the hw devices? */
>> +	mutex_lock(&mpam_list_lock);
>> +	list_for_each_entry(msc, &mpam_all_msc, glbl_list) {
>> +		mutex_lock(&msc->probe_lock);
>> +		if (!msc->probed)
>> +			all_devices_probed = false;
>> +		mutex_unlock(&msc->probe_lock);
>> +
>> +		if (!all_devices_probed)
>> +			break;
>
> WARN()?

Expected condition...

> We counted the MSCs in via the mpam_discovery_cpu_online(), so I think
> we shouldn't get in here if some failed to probe?

No we didn't! We counted them in mpam_msc_drv_probe() before registering
the cpuhp callbacks. The cpuhp callbacks run asynchronously, each one
schedules mpam_enable() iff it probed a new device. mpam_enable() then
has to check if all the devices had been probed.

This is done so that mpam_discovery_cpu_online() only has to take the
msc->probe_lock for MSC that it can access - instead of all of them.
That was to avoid having something that serialises CPUs coming online
... but mpam_discovery_cpu_online() is taking the list_lock due to an
incomplete switch to SRCU. I'll fix that.

>> +	}
>> +	mutex_unlock(&mpam_list_lock);
>> +
>> +	if (all_devices_probed && !atomic_fetch_inc(&once))
>> +		mpam_enable_once();
>> +}


Thanks,

James
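(As a rough illustration of the ordering problem discussed above - a
sketch only, with invented names, not the code from any later patch:
static_branch_enable() takes cpus_read_lock(), which a cpuhp callback
cannot take again, hence the deferral to a workqueue:)

#include <linux/jump_label.h>
#include <linux/workqueue.h>

static DEFINE_STATIC_KEY_FALSE(example_ctxswitch_key);	/* invented name */

static void example_enable(struct work_struct *work)
{
	/* Process context: taking cpus_read_lock() here is fine. */
	static_branch_enable(&example_ctxswitch_key);
}
static DECLARE_WORK(example_enable_work, example_enable);

static int example_cpu_online(unsigned int cpu)
{
	/*
	 * cpuhp callbacks run under the hotplug lock, so enabling the
	 * static key directly here would deadlock: defer it instead.
	 */
	schedule_work(&example_enable_work);
	return 0;
}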
On Fri, Aug 22, 2025 at 10:32 AM James Morse <james.morse@arm.com> wrote:
>
> Because an MSC can only by accessed from the CPUs in its cpu-affinity
> set we need to be running on one of those CPUs to probe the MSC
> hardware.
>
> Do this work in the cpuhp callback. Probing the hardware will only
> happen before MPAM is enabled, walk all the MSCs and probe those we can
> reach that haven't already been probed.
>
> Later once MPAM is enabled, this cpuhp callback will be replaced by
> one that avoids the global list.
>
> Enabling a static key will also take the cpuhp lock, so can't be done
> from the cpuhp callback. Whenever a new MSC has been probed schedule
> work to test if all the MSCs have now been probed.
>
> CC: Lecopzer Chen <lecopzerc@nvidia.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
>  drivers/resctrl/mpam_devices.c  | 144 +++++++++++++++++++++++++++++++-
>  drivers/resctrl/mpam_internal.h |   8 +-
>  2 files changed, 147 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
> index 5baf2a8786fb..9d6516f98acf 100644
> --- a/drivers/resctrl/mpam_devices.c
> +++ b/drivers/resctrl/mpam_devices.c
> @@ -4,6 +4,7 @@
>  #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
>
>  #include <linux/acpi.h>
> +#include <linux/atomic.h>
>  #include <linux/arm_mpam.h>
>  #include <linux/cacheinfo.h>
>  #include <linux/cpu.h>
> @@ -21,6 +22,7 @@
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
>  #include <linux/types.h>
> +#include <linux/workqueue.h>
>
>  #include <acpi/pcc.h>
>
> @@ -39,6 +41,16 @@ struct srcu_struct mpam_srcu;
>  /* MPAM isn't available until all the MSC have been probed. */
>  static u32 mpam_num_msc;
>
> +static int mpam_cpuhp_state;
> +static DEFINE_MUTEX(mpam_cpuhp_state_lock);
> +
> +/*
> + * mpam is enabled once all devices have been probed from CPU online callbacks,
> + * scheduled via this work_struct. If access to an MSC depends on a CPU that
> + * was not brought online at boot, this can happen surprisingly late.
> + */
> +static DECLARE_WORK(mpam_enable_work, &mpam_enable);
> +
>  /*
>   * An MSC is a physical container for controls and monitors, each identified by
>   * their RIS index. These share a base-address, interrupts and some MMIO
> @@ -78,6 +90,22 @@ LIST_HEAD(mpam_classes);
>  /* List of all objects that can be free()d after synchronise_srcu() */
>  static LLIST_HEAD(mpam_garbage);
>
> +static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg)
> +{
> +	WARN_ON_ONCE(reg > msc->mapped_hwpage_sz);
> +	WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility));

These either make __mpam_read_reg uninlined or add 2 checks to every
register read. Neither seems very good.

> +
> +	return readl_relaxed(msc->mapped_hwpage + reg);
> +}
> +
> +static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg)
> +{
> +	lockdep_assert_held_once(&msc->part_sel_lock);

Similar thing here.

> +	return __mpam_read_reg(msc, reg);
> +}
> +
> +#define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg)
> +
>  #define init_garbage(x)	init_llist_node(&(x)->garbage.llist)
>
>  static struct mpam_vmsc *
> @@ -511,9 +539,84 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
>  	return err;
>  }
>
> -static void mpam_discovery_complete(void)

It is annoying to review things which disappear in later patches...

> +static int mpam_msc_hw_probe(struct mpam_msc *msc)
> +{
> +	u64 idr;
> +	int err;
> +
> +	lockdep_assert_held(&msc->probe_lock);
> +
> +	mutex_lock(&msc->part_sel_lock);
> +	idr = mpam_read_partsel_reg(msc, AIDR);

I don't think AIDR access depends on PART_SEL.

> +	if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) {
> +		pr_err_once("%s does not match MPAM architecture v1.x\n",
> +			    dev_name(&msc->pdev->dev));
> +		err = -EIO;
> +	} else {
> +		msc->probed = true;
> +		err = 0;
> +	}
> +	mutex_unlock(&msc->part_sel_lock);
> +
> +	return err;
> +}
> +
> +static int mpam_cpu_online(unsigned int cpu)
>  {
> -	pr_err("Discovered all MSC\n");
> +	return 0;
> +}
> +
> +/* Before mpam is enabled, try to probe new MSC */
> +static int mpam_discovery_cpu_online(unsigned int cpu)
> +{
> +	int err = 0;
> +	struct mpam_msc *msc;
> +	bool new_device_probed = false;
> +
> +	mutex_lock(&mpam_list_lock);
> +	list_for_each_entry(msc, &mpam_all_msc, glbl_list) {
> +		if (!cpumask_test_cpu(cpu, &msc->accessibility))
> +			continue;
> +
> +		mutex_lock(&msc->probe_lock);
> +		if (!msc->probed)
> +			err = mpam_msc_hw_probe(msc);
> +		mutex_unlock(&msc->probe_lock);
> +
> +		if (!err)
> +			new_device_probed = true;
> +		else
> +			break; // mpam_broken
> +	}
> +	mutex_unlock(&mpam_list_lock);
> +
> +	if (new_device_probed && !err)
> +		schedule_work(&mpam_enable_work);
> +
> +	return err;
> +}
> +
> +static int mpam_cpu_offline(unsigned int cpu)
> +{
> +	return 0;
> +}
> +
> +static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online),
> +					  int (*offline)(unsigned int offline))
> +{
> +	mutex_lock(&mpam_cpuhp_state_lock);
> +	if (mpam_cpuhp_state) {
> +		cpuhp_remove_state(mpam_cpuhp_state);
> +		mpam_cpuhp_state = 0;
> +	}
> +
> +	mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mpam:online",
> +					     online, offline);
> +	if (mpam_cpuhp_state <= 0) {
> +		pr_err("Failed to register cpuhp callbacks");
> +		mpam_cpuhp_state = 0;
> +	}
> +	mutex_unlock(&mpam_cpuhp_state_lock);
>  }
>
>  static int mpam_dt_count_msc(void)
> @@ -772,7 +875,7 @@ static int mpam_msc_drv_probe(struct platform_device *pdev)
>  	}
>
>  	if (!err && fw_num_msc == mpam_num_msc)
> -		mpam_discovery_complete();
> +		mpam_register_cpuhp_callbacks(&mpam_discovery_cpu_online, NULL);
>
>  	if (err && msc)
>  		mpam_msc_drv_remove(pdev);
> @@ -795,6 +898,41 @@ static struct platform_driver mpam_msc_driver = {
>  	.remove = mpam_msc_drv_remove,
>  };
>
> +static void mpam_enable_once(void)
> +{
> +	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);
> +
> +	pr_info("MPAM enabled\n");
> +}
> +
> +/*
> + * Enable mpam once all devices have been probed.
> + * Scheduled by mpam_discovery_cpu_online() once all devices have been created.
> + * Also scheduled when new devices are probed when new CPUs come online.
> + */
> +void mpam_enable(struct work_struct *work)
> +{
> +	static atomic_t once;
> +	struct mpam_msc *msc;
> +	bool all_devices_probed = true;
> +
> +	/* Have we probed all the hw devices? */
> +	mutex_lock(&mpam_list_lock);
> +	list_for_each_entry(msc, &mpam_all_msc, glbl_list) {
> +		mutex_lock(&msc->probe_lock);
> +		if (!msc->probed)
> +			all_devices_probed = false;
> +		mutex_unlock(&msc->probe_lock);
> +
> +		if (!all_devices_probed)
> +			break;
> +	}
> +	mutex_unlock(&mpam_list_lock);
> +
> +	if (all_devices_probed && !atomic_fetch_inc(&once))
> +		mpam_enable_once();
> +}
> +
> /*
>  * MSC that are hidden under caches are not created as platform devices
>  * as there is no cache driver. Caches are also special-cased in
> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> index 6e0982a1a9ac..a98cca08a2ef 100644
> --- a/drivers/resctrl/mpam_internal.h
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -49,6 +49,7 @@ struct mpam_msc {
>  	 * properties become read-only and the lists are protected by SRCU.
>  	 */
>  	struct mutex		probe_lock;
> +	bool			probed;
>  	unsigned long		ris_idxs[128 / BITS_PER_LONG];
>  	u32			ris_max;
>
> @@ -59,14 +60,14 @@ struct mpam_msc {
>  	 * part_sel_lock protects access to the MSC hardware registers that are
>  	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
>  	 * by RIS).
> -	 * If needed, take msc->lock first.
> +	 * If needed, take msc->probe_lock first.

Humm. I think this belongs in patch 10.

>  	 */
>  	struct mutex		part_sel_lock;
>
>  	/*
>  	 * mon_sel_lock protects access to the MSC hardware registers that are
>  	 * affeted by MPAMCFG_MON_SEL.
> -	 * If needed, take msc->lock first.
> +	 * If needed, take msc->probe_lock first.
>  	 */
>  	struct mutex		outer_mon_sel_lock;
>  	raw_spinlock_t		inner_mon_sel_lock;
> @@ -147,6 +148,9 @@ struct mpam_msc_ris {
>  extern struct srcu_struct mpam_srcu;
>  extern struct list_head mpam_classes;
>
> +/* Scheduled work callback to enable mpam once all MSC have been probed */
> +void mpam_enable(struct work_struct *work);
> +
>  int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
>  				   cpumask_t *affinity);
>
> --
> 2.20.1
>
Hi Rob,

On 27/08/2025 17:08, Rob Herring wrote:
> On Fri, Aug 22, 2025 at 10:32 AM James Morse <james.morse@arm.com> wrote:
>>
>> Because an MSC can only by accessed from the CPUs in its cpu-affinity
>> set we need to be running on one of those CPUs to probe the MSC
>> hardware.
>>
>> Do this work in the cpuhp callback. Probing the hardware will only
>> happen before MPAM is enabled, walk all the MSCs and probe those we can
>> reach that haven't already been probed.
>>
>> Later once MPAM is enabled, this cpuhp callback will be replaced by
>> one that avoids the global list.
>>
>> Enabling a static key will also take the cpuhp lock, so can't be done
>> from the cpuhp callback. Whenever a new MSC has been probed schedule
>> work to test if all the MSCs have now been probed.

>> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
>> index 5baf2a8786fb..9d6516f98acf 100644
>> --- a/drivers/resctrl/mpam_devices.c
>> +++ b/drivers/resctrl/mpam_devices.c

>> @@ -78,6 +90,22 @@ LIST_HEAD(mpam_classes);
>>  /* List of all objects that can be free()d after synchronise_srcu() */
>>  static LLIST_HEAD(mpam_garbage);
>>
>> +static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg)
>> +{
>> +	WARN_ON_ONCE(reg > msc->mapped_hwpage_sz);
>> +	WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility));
>
> These either make __mpam_read_reg uninlined or add 2 checks to every
> register read. Neither seems very good.

The mapping-bounds one is from before the ACPI table had the size of the
mapping. I can remove that one now.

The accessibility one really needs checking as getting this wrong will
only occasionally cause a deadlock if you get unlucky with
power-management. I don't think we'd ever manage to debug that, hence
the check. The server platforms we'll see first aren't going to bother
with PSCI CPU_SUSPEND - but mobile devices will.

If the compiler chooses not to inline this, I'm fine with that. Accesses
to the device mapped configuration are rare, and always via a resctrl
filesystem access. I don't think the performance matters.

>> +
>> +	return readl_relaxed(msc->mapped_hwpage + reg);
>> +}
>> +
>> +static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg)
>> +{
>> +	lockdep_assert_held_once(&msc->part_sel_lock);
>
> Similar thing here.

If you don't build with lockdep this costs you nothing.

>> +	return __mpam_read_reg(msc, reg);
>> +}
>> +
>> +#define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg)
>> +
>>  #define init_garbage(x)	init_llist_node(&(x)->garbage.llist)
>>
>>  static struct mpam_vmsc *
>> @@ -511,9 +539,84 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
>>  	return err;
>>  }
>>
>> -static void mpam_discovery_complete(void)
>
> It is annoying to review things which disappear in later patches...

It's a printk - its purpose was to show something happens once all the
MSC have been probed. That was supposed to make it easier to review as
it has always had this shape from the beginning.

This patch adds the hardware accesses that do the probing, which happen
from cpuhp calls - which in turn moves this 'complete' work to be
scheduled. As this seems to be causing confusion I'll inline it so it
doesn't look strange.

>> +static int mpam_msc_hw_probe(struct mpam_msc *msc)
>> +{
>> +	u64 idr;
>> +	int err;
>> +
>> +	lockdep_assert_held(&msc->probe_lock);
>> +
>> +	mutex_lock(&msc->part_sel_lock);
>> +	idr = mpam_read_partsel_reg(msc, AIDR);
>
> I don't think AIDR access depends on PART_SEL.

It doesn't, but as most registers do, it was just simpler to pretend it
does. I'll shove the __version in here instead, which will save taking
the lock.

>> +	if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) {
>> +		pr_err_once("%s does not match MPAM architecture v1.x\n",
>> +			    dev_name(&msc->pdev->dev));
>> +		err = -EIO;
>> +	} else {
>> +		msc->probed = true;
>> +		err = 0;
>> +	}
>> +	mutex_unlock(&msc->part_sel_lock);
>> +
>> +	return err;
>> +}

>> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
>> index 6e0982a1a9ac..a98cca08a2ef 100644
>> --- a/drivers/resctrl/mpam_internal.h
>> +++ b/drivers/resctrl/mpam_internal.h
>> @@ -59,14 +60,14 @@ struct mpam_msc {
>>  	 * part_sel_lock protects access to the MSC hardware registers that are
>>  	 * affected by MPAMCFG_PART_SEL. (including the ID registers that vary
>>  	 * by RIS).
>> -	 * If needed, take msc->lock first.
>> +	 * If needed, take msc->probe_lock first.
>
> Humm. I think this belongs in patch 10.

Yup, fixed.


Thanks,

James
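(For illustration, the follow-up change described above might look
something like the following - a sketch only, reusing the names from the
posted patch, not the actual respin: AIDR isn't affected by
MPAMCFG_PART_SEL, so the "__" accessor can be used without taking
part_sel_lock, which also removes the need for the separate err
variable:)

static int mpam_msc_hw_probe(struct mpam_msc *msc)
{
	u64 idr;

	lockdep_assert_held(&msc->probe_lock);

	/*
	 * AIDR is not affected by MPAMCFG_PART_SEL, so use the "__"
	 * accessor directly instead of taking part_sel_lock.
	 */
	idr = __mpam_read_reg(msc, MPAMF_AIDR);
	if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) {
		pr_err_once("%s does not match MPAM architecture v1.x\n",
			    dev_name(&msc->pdev->dev));
		return -EIO;
	}

	msc->probed = true;
	return 0;
}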