[PATCH v2 18/29] arm_mpam: Register and enable IRQs
Posted by James Morse 4 months, 4 weeks ago
Register and enable error IRQs. All the MPAM error interrupts indicate a
software bug, e.g. out of range partid. If the error interrupt is ever
signalled, attempt to disable MPAM.

Only the irq handler accesses the ESR register, so no locking is needed.
The work to disable MPAM after an error needs to happen in process
context as it takes a mutex. It also unregisters the interrupts, meaning
it can't be done from the threaded part of a threaded interrupt.
Instead, mpam_disable() gets scheduled.

Enabling the IRQs in the MSC may involve cross calling to a CPU that
can access the MSC.

Once the IRQ is requested, the mpam_disable() path can be called
asynchronously, which will walk structures sized by max_partid. Ensure
this size is fixed before the interrupt is requested.

CC: Rohit Mathew <rohit.mathew@arm.com>
Tested-by: Rohit Mathew <rohit.mathew@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
---
Changes since v1:
 * Made mpam_unregister_irqs() safe to race with itself.
 * Removed threaded interrupts.
 * Schedule mpam_disable() from cpuhp callback in the case of an error.
 * Added mpam_disable_reason.
 * Use alloc_percpu()

Changes since RFC:
 * Use guard macro when walking srcu list.
 * Use INTEN macro for enabling interrupts.
 * Move partid_max_published up earlier in mpam_enable_once().
---
 drivers/resctrl/mpam_devices.c  | 277 +++++++++++++++++++++++++++++++-
 drivers/resctrl/mpam_internal.h |  10 ++
 2 files changed, 284 insertions(+), 3 deletions(-)

diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index a9d3c4b09976..e7e4afc1ea95 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -14,6 +14,9 @@
 #include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/gfp.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/mutex.h>
@@ -166,6 +169,24 @@ static u64 mpam_msc_read_idr(struct mpam_msc *msc)
 	return (idr_high << 32) | idr_low;
 }
 
+static void mpam_msc_zero_esr(struct mpam_msc *msc)
+{
+	__mpam_write_reg(msc, MPAMF_ESR, 0);
+	if (msc->has_extd_esr)
+		__mpam_write_reg(msc, MPAMF_ESR + 4, 0);
+}
+
+static u64 mpam_msc_read_esr(struct mpam_msc *msc)
+{
+	u64 esr_high = 0, esr_low;
+
+	esr_low = __mpam_read_reg(msc, MPAMF_ESR);
+	if (msc->has_extd_esr)
+		esr_high = __mpam_read_reg(msc, MPAMF_ESR + 4);
+
+	return (esr_high << 32) | esr_low;
+}
+
 static void __mpam_part_sel_raw(u32 partsel, struct mpam_msc *msc)
 {
 	lockdep_assert_held(&msc->part_sel_lock);
@@ -754,6 +775,7 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc)
 		pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr);
 		msc->partid_max = min(msc->partid_max, partid_max);
 		msc->pmg_max = min(msc->pmg_max, pmg_max);
+		msc->has_extd_esr = FIELD_GET(MPAMF_IDR_HAS_EXTD_ESR, idr);
 
 		mutex_lock(&mpam_list_lock);
 		ris = mpam_get_or_create_ris(msc, ris_idx);
@@ -768,6 +790,9 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc)
 		mutex_unlock(&msc->part_sel_lock);
 	}
 
+	/* Clear any stale errors */
+	mpam_msc_zero_esr(msc);
+
 	spin_lock(&partid_max_lock);
 	mpam_partid_max = min(mpam_partid_max, msc->partid_max);
 	mpam_pmg_max = min(mpam_pmg_max, msc->pmg_max);
@@ -895,6 +920,13 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online)
 	}
 }
 
+static void _enable_percpu_irq(void *_irq)
+{
+	int *irq = _irq;
+
+	enable_percpu_irq(*irq, IRQ_TYPE_NONE);
+}
+
 static int mpam_cpu_online(unsigned int cpu)
 {
 	int idx;
@@ -906,6 +938,9 @@ static int mpam_cpu_online(unsigned int cpu)
 		if (!cpumask_test_cpu(cpu, &msc->accessibility))
 			continue;
 
+		if (msc->reenable_error_ppi)
+			_enable_percpu_irq(&msc->reenable_error_ppi);
+
 		if (atomic_fetch_inc(&msc->online_refs) == 0)
 			mpam_reset_msc(msc, true);
 	}
@@ -959,6 +994,9 @@ static int mpam_cpu_offline(unsigned int cpu)
 		if (!cpumask_test_cpu(cpu, &msc->accessibility))
 			continue;
 
+		if (msc->reenable_error_ppi)
+			disable_percpu_irq(msc->reenable_error_ppi);
+
 		if (atomic_dec_and_test(&msc->online_refs))
 			mpam_reset_msc(msc, false);
 	}
@@ -985,6 +1023,51 @@ static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online),
 	mutex_unlock(&mpam_cpuhp_state_lock);
 }
 
+static int __setup_ppi(struct mpam_msc *msc)
+{
+	int cpu;
+	struct device *dev = &msc->pdev->dev;
+
+	msc->error_dev_id = alloc_percpu(struct mpam_msc *);
+	if (!msc->error_dev_id)
+		return -ENOMEM;
+
+	for_each_cpu(cpu, &msc->accessibility) {
+		struct mpam_msc *empty = *per_cpu_ptr(msc->error_dev_id, cpu);
+
+		if (empty) {
+			dev_err_once(dev, "MSC shares PPI with %s!\n",
+				     dev_name(&empty->pdev->dev));
+			return -EBUSY;
+		}
+		*per_cpu_ptr(msc->error_dev_id, cpu) = msc;
+	}
+
+	return 0;
+}
+
+static int mpam_msc_setup_error_irq(struct mpam_msc *msc)
+{
+	int irq;
+
+	irq = platform_get_irq_byname_optional(msc->pdev, "error");
+	if (irq <= 0)
+		return 0;
+
+	/* Allocate and initialise the percpu device pointer for PPI */
+	if (irq_is_percpu(irq))
+		return __setup_ppi(msc);
+
+	/* sanity check: shared interrupts can be routed anywhere? */
+	if (!cpumask_equal(&msc->accessibility, cpu_possible_mask)) {
+		pr_err_once("msc:%u is a private resource with a shared error interrupt",
+			    msc->id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /*
  * An MSC can control traffic from a set of CPUs, but may only be accessible
  * from a (hopefully wider) set of CPUs. The common reason for this is power
@@ -1060,6 +1143,10 @@ static int mpam_msc_drv_probe(struct platform_device *pdev)
 			break;
 		}
 
+		err = mpam_msc_setup_error_irq(msc);
+		if (err)
+			break;
+
 		if (device_property_read_u32(&pdev->dev, "pcc-channel",
 					     &msc->pcc_subspace_id))
 			msc->iface = MPAM_IFACE_MMIO;
@@ -1318,11 +1405,172 @@ static void mpam_enable_merge_features(struct list_head *all_classes_list)
 	}
 }
 
+static char *mpam_errcode_names[16] = {
+	[0] = "No error",
+	[1] = "PARTID_SEL_Range",
+	[2] = "Req_PARTID_Range",
+	[3] = "MSMONCFG_ID_RANGE",
+	[4] = "Req_PMG_Range",
+	[5] = "Monitor_Range",
+	[6] = "intPARTID_Range",
+	[7] = "Unexpected_INTERNAL",
+	[8] = "Undefined_RIS_PART_SEL",
+	[9] = "RIS_No_Control",
+	[10] = "Undefined_RIS_MON_SEL",
+	[11] = "RIS_No_Monitor",
+	[12 ... 15] = "Reserved"
+};
+
+static int mpam_enable_msc_ecr(void *_msc)
+{
+	struct mpam_msc *msc = _msc;
+
+	__mpam_write_reg(msc, MPAMF_ECR, MPAMF_ECR_INTEN);
+
+	return 0;
+}
+
+/* This can run in mpam_disable(), and the interrupt handler on the same CPU */
+static int mpam_disable_msc_ecr(void *_msc)
+{
+	struct mpam_msc *msc = _msc;
+
+	__mpam_write_reg(msc, MPAMF_ECR, 0);
+
+	return 0;
+}
+
+static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc)
+{
+	u64 reg;
+	u16 partid;
+	u8 errcode, pmg, ris;
+
+	if (WARN_ON_ONCE(!msc) ||
+	    WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(),
+					   &msc->accessibility)))
+		return IRQ_NONE;
+
+	reg = mpam_msc_read_esr(msc);
+
+	errcode = FIELD_GET(MPAMF_ESR_ERRCODE, reg);
+	if (!errcode)
+		return IRQ_NONE;
+
+	/* Clear level triggered irq */
+	mpam_msc_zero_esr(msc);
+
+	partid = FIELD_GET(MPAMF_ESR_PARTID_MON, reg);
+	pmg = FIELD_GET(MPAMF_ESR_PMG, reg);
+	ris = FIELD_GET(MPAMF_ESR_RIS, reg);
+
+	pr_err_ratelimited("error irq from msc:%u '%s', partid:%u, pmg: %u, ris: %u\n",
+			   msc->id, mpam_errcode_names[errcode], partid, pmg,
+			   ris);
+
+	/* Disable this interrupt. */
+	mpam_disable_msc_ecr(msc);
+
+	/*
+	 * Schedule the teardown work. Don't use a threaded IRQ as we can't
+	 * unregister the interrupt from the threaded part of the handler.
+	 */
+	mpam_disable_reason = "hardware error interrupt";
+	schedule_work(&mpam_broken_work);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mpam_ppi_handler(int irq, void *dev_id)
+{
+	struct mpam_msc *msc = *(struct mpam_msc **)dev_id;
+
+	return __mpam_irq_handler(irq, msc);
+}
+
+static irqreturn_t mpam_spi_handler(int irq, void *dev_id)
+{
+	struct mpam_msc *msc = dev_id;
+
+	return __mpam_irq_handler(irq, msc);
+}
+
+static int mpam_register_irqs(void)
+{
+	int err, irq;
+	struct mpam_msc *msc;
+
+	lockdep_assert_cpus_held();
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		irq = platform_get_irq_byname_optional(msc->pdev, "error");
+		if (irq <= 0)
+			continue;
+
+		/* The MPAM spec says the interrupt can be SPI, PPI or LPI */
+		/* We anticipate sharing the interrupt with other MSCs */
+		if (irq_is_percpu(irq)) {
+			err = request_percpu_irq(irq, &mpam_ppi_handler,
+						 "mpam:msc:error",
+						 msc->error_dev_id);
+			if (err)
+				return err;
+
+			msc->reenable_error_ppi = irq;
+			smp_call_function_many(&msc->accessibility,
+					       &_enable_percpu_irq, &irq,
+					       true);
+		} else {
+			err = devm_request_irq(&msc->pdev->dev, irq,
+					       &mpam_spi_handler, IRQF_SHARED,
+					       "mpam:msc:error", msc);
+			if (err)
+				return err;
+		}
+
+		set_bit(MPAM_ERROR_IRQ_REQUESTED, &msc->error_irq_flags);
+		mpam_touch_msc(msc, mpam_enable_msc_ecr, msc);
+		set_bit(MPAM_ERROR_IRQ_HW_ENABLED, &msc->error_irq_flags);
+	}
+
+	return 0;
+}
+
+static void mpam_unregister_irqs(void)
+{
+	int irq, idx;
+	struct mpam_msc *msc;
+
+	cpus_read_lock();
+	/* take the lock as free_irq() can sleep */
+	idx = srcu_read_lock(&mpam_srcu);
+	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		irq = platform_get_irq_byname_optional(msc->pdev, "error");
+		if (irq <= 0)
+			continue;
+
+		if (test_and_clear_bit(MPAM_ERROR_IRQ_HW_ENABLED, &msc->error_irq_flags))
+			mpam_touch_msc(msc, mpam_disable_msc_ecr, msc);
+
+		if (test_and_clear_bit(MPAM_ERROR_IRQ_REQUESTED, &msc->error_irq_flags)) {
+			if (irq_is_percpu(irq)) {
+				msc->reenable_error_ppi = 0;
+				free_percpu_irq(irq, msc->error_dev_id);
+			} else {
+				devm_free_irq(&msc->pdev->dev, irq, msc);
+			}
+		}
+	}
+	srcu_read_unlock(&mpam_srcu, idx);
+	cpus_read_unlock();
+}
+
 static void mpam_enable_once(void)
 {
-	mutex_lock(&mpam_list_lock);
-	mpam_enable_merge_features(&mpam_classes);
-	mutex_unlock(&mpam_list_lock);
+	int err;
 
 	/*
 	 * Once the cpuhp callbacks have been changed, mpam_partid_max can no
@@ -1332,6 +1580,27 @@ static void mpam_enable_once(void)
 	partid_max_published = true;
 	spin_unlock(&partid_max_lock);
 
+	/*
+	 * If all the MSC have been probed, enabling the IRQs happens next.
+	 * That involves cross-calling to a CPU that can reach the MSC, and
+	 * the locks must be taken in this order:
+	 */
+	cpus_read_lock();
+	mutex_lock(&mpam_list_lock);
+	mpam_enable_merge_features(&mpam_classes);
+
+	err = mpam_register_irqs();
+	if (err)
+		pr_warn("Failed to register irqs: %d\n", err);
+
+	mutex_unlock(&mpam_list_lock);
+	cpus_read_unlock();
+
+	if (err) {
+		schedule_work(&mpam_broken_work);
+		return;
+	}
+
 	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);
 
 	printk(KERN_INFO "MPAM enabled with %u PARTIDs and %u PMGs\n",
@@ -1397,6 +1666,8 @@ void mpam_disable(struct work_struct *ignored)
 	}
 	mutex_unlock(&mpam_cpuhp_state_lock);
 
+	mpam_unregister_irqs();
+
 	idx = srcu_read_lock(&mpam_srcu);
 	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
 				 srcu_read_lock_held(&mpam_srcu))
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index 6e047fbd3512..f04a9ef189cf 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -32,6 +32,10 @@ struct mpam_garbage {
 	struct platform_device	*pdev;
 };
 
+/* Bit positions for error_irq_flags */
+#define	MPAM_ERROR_IRQ_REQUESTED  0
+#define	MPAM_ERROR_IRQ_HW_ENABLED 1
+
 struct mpam_msc {
 	/* member of mpam_all_msc */
 	struct list_head        all_msc_list;
@@ -46,6 +50,11 @@ struct mpam_msc {
 	struct pcc_mbox_chan	*pcc_chan;
 	u32			nrdy_usec;
 	cpumask_t		accessibility;
+	bool			has_extd_esr;
+
+	int				reenable_error_ppi;
+	struct mpam_msc * __percpu	*error_dev_id;
+
 	atomic_t		online_refs;
 
 	/*
@@ -54,6 +63,7 @@ struct mpam_msc {
 	 */
 	struct mutex		probe_lock;
 	bool			probed;
+	unsigned long		error_irq_flags;
 	u16			partid_max;
 	u8			pmg_max;
 	unsigned long		ris_idxs;
-- 
2.39.5
Re: [PATCH v2 18/29] arm_mpam: Register and enable IRQs
Posted by Dave Martin 4 months, 3 weeks ago
Hi James,

On Wed, Sep 10, 2025 at 08:42:58PM +0000, James Morse wrote:
> Register and enable error IRQs. All the MPAM error interrupts indicate a
> software bug, e.g. out of range partid. If the error interrupt is ever
> signalled, attempt to disable MPAM.
> 
> Only the irq handler accesses the ESR register, so no locking is needed.

Nit: MPAMF_ESR?  (Casual readers may confuse it with ESR_ELx.
Formally, there is no MPAM "ESR" register, though people familiar with
the spec will of course know what you're referring to.)

> The work to disable MPAM after an error needs to happen in process
> context as it takes a mutex. It also unregisters the interrupts, meaning
> it can't be done from the threaded part of a threaded interrupt.
> Instead, mpam_disable() gets scheduled.
> 
> Enabling the IRQs in the MSC may involve cross calling to a CPU that
> can access the MSC.
> 
> Once the IRQ is requested, the mpam_disable() path can be called
> asynchronously, which will walk structures sized by max_partid. Ensure
> this size is fixed before the interrupt is requested.
> 
> CC: Rohit Mathew <rohit.mathew@arm.com>
> Tested-by: Rohit Mathew <rohit.mathew@arm.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> Changes since v1:
>  * Made mpam_unregister_irqs() safe to race with itself.
>  * Removed threaded interrupts.
>  * Schedule mpam_disable() from cpuhp callback in the case of an error.
>  * Added mpam_disable_reason.
>  * Use alloc_percpu()
> 
> Changes since RFC:
>  * Use guard macro when walking srcu list.
>  * Use INTEN macro for enabling interrupts.
>  * Move partid_max_published up earlier in mpam_enable_once().
> ---
>  drivers/resctrl/mpam_devices.c  | 277 +++++++++++++++++++++++++++++++-
>  drivers/resctrl/mpam_internal.h |  10 ++
>  2 files changed, 284 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
> index a9d3c4b09976..e7e4afc1ea95 100644
> --- a/drivers/resctrl/mpam_devices.c
> +++ b/drivers/resctrl/mpam_devices.c
> @@ -14,6 +14,9 @@
>  #include <linux/device.h>
>  #include <linux/errno.h>
>  #include <linux/gfp.h>
> +#include <linux/interrupt.h>
> +#include <linux/irq.h>
> +#include <linux/irqdesc.h>
>  #include <linux/list.h>
>  #include <linux/lockdep.h>
>  #include <linux/mutex.h>
> @@ -166,6 +169,24 @@ static u64 mpam_msc_read_idr(struct mpam_msc *msc)
>  	return (idr_high << 32) | idr_low;
>  }
>  
> +static void mpam_msc_zero_esr(struct mpam_msc *msc)

Nit: Maybe clear_esr?  (The fact that setting the ERRCODE and OVRWR
fields to zero clears the interrupt and prepares for unambiguous
reporting of the next error is more of an implementation detail.
It doesn't matter what the rest of the register is set to.)

> +{
> +	__mpam_write_reg(msc, MPAMF_ESR, 0);
> +	if (msc->has_extd_esr)

This deasserts the interrupt (if level-sensitive) and enables the MSC
to report further errors.  If we are unlucky and an error occurs now,
won't we splat the newly HW-generated RIS field by:

> +		__mpam_write_reg(msc, MPAMF_ESR + 4, 0);

...?  If so, we will diagnose the wrong RIS when we pump the new error
from MPAMF_ESR.  I think the correct interpretation of the spec may be
that:

 a) software should treat fields in MPAMF_ESR[63:32] as valid only if
    ERRCODE is nonzero, and

 b) software should never write to MPAMF_ESR[63:32] while ERRCODE is
    zero.

Does this look right?  Should the fields be cleared in the opposite
order?

Or alternatively, is it actually necessary to clear MPAMF_ESR[63:32]
at all?

(The spec seems a bit vague on what software is supposed to do with
this register to ensure correctness...)

> +}
> +
> +static u64 mpam_msc_read_esr(struct mpam_msc *msc)
> +{
> +	u64 esr_high = 0, esr_low;
> +
> +	esr_low = __mpam_read_reg(msc, MPAMF_ESR);
> +	if (msc->has_extd_esr)
> +		esr_high = __mpam_read_reg(msc, MPAMF_ESR + 4);
> +
> +	return (esr_high << 32) | esr_low;
> +}

[...]

> @@ -895,6 +920,13 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online)
>  	}
>  }
>  
> +static void _enable_percpu_irq(void *_irq)
> +{
> +	int *irq = _irq;
> +
> +	enable_percpu_irq(*irq, IRQ_TYPE_NONE);

Can the type vary?  (Maybe this makes no sense on GIC-based systems --
IRQ_TYPE_NONE (or "0") seems overwhelmingly common.)

(Just my lack of familiarity talking, here.)

[...]

> +static int __setup_ppi(struct mpam_msc *msc)
> +{
> +	int cpu;
> +	struct device *dev = &msc->pdev->dev;
> +
> +	msc->error_dev_id = alloc_percpu(struct mpam_msc *);
> +	if (!msc->error_dev_id)
> +		return -ENOMEM;
> +
> +	for_each_cpu(cpu, &msc->accessibility) {
> +		struct mpam_msc *empty = *per_cpu_ptr(msc->error_dev_id, cpu);
> +
> +		if (empty) {
> +			dev_err_once(dev, "MSC shares PPI with %s!\n",
> +				     dev_name(&empty->pdev->dev));
> +			return -EBUSY;
> +		}
> +		*per_cpu_ptr(msc->error_dev_id, cpu) = msc;
> +	}

How are PPIs supposed to work?

An individual MSC that is affine to multiple CPUs has no way to
distinguish which CPU an error relates to, and no CPU-specific (or even
RIS-specific) ESR.

So, won't such an interrupt pointlessly be taken on all CPUs, which
would all fight over the reported event?

Have you encountered any platforms wired up this way?  The spec
recommends not to do this, but does not provide any rationale...

The spec only mentions PPIs in the context of being affine to a single
CPU (PE).  It's not clear to me that any other use of PPIs makes
sense (?)

If we really have to cope with this, maybe it would make sense to pick
a single CPU in the affinity set (though we might have to move it
around if the unlucky CPU is offlined).

[...]

> +static char *mpam_errcode_names[16] = {
> +	[0] = "No error",
> +	[1] = "PARTID_SEL_Range",
> +	[2] = "Req_PARTID_Range",
> +	[3] = "MSMONCFG_ID_RANGE",
> +	[4] = "Req_PMG_Range",
> +	[5] = "Monitor_Range",
> +	[6] = "intPARTID_Range",
> +	[7] = "Unexpected_INTERNAL",
> +	[8] = "Undefined_RIS_PART_SEL",
> +	[9] = "RIS_No_Control",
> +	[10] = "Undefined_RIS_MON_SEL",
> +	[11] = "RIS_No_Monitor",
> +	[12 ... 15] = "Reserved"
> +};
> +
> +static int mpam_enable_msc_ecr(void *_msc)
> +{
> +	struct mpam_msc *msc = _msc;
> +
> +	__mpam_write_reg(msc, MPAMF_ECR, MPAMF_ECR_INTEN);
> +
> +	return 0;
> +}

This could also be a switch () { case 0: return "foo";
case 1: return "bar"; ... }, without the explicit table.  This would
avoid having to think about the ERRCODE field growing.  (There are some
RES0 bits looming over it.)

(This also tends to avoid the extra pointer table in .rodata, which
might be of interest if this were a hot path.)
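
Something like this (untested sketch, helper name made up), reusing the
strings from the table above:

	static const char *mpam_errcode_name(u8 errcode)
	{
		switch (errcode) {
		case 0:  return "No error";
		case 1:  return "PARTID_SEL_Range";
		case 2:  return "Req_PARTID_Range";
		case 3:  return "MSMONCFG_ID_RANGE";
		case 4:  return "Req_PMG_Range";
		case 5:  return "Monitor_Range";
		case 6:  return "intPARTID_Range";
		case 7:  return "Unexpected_INTERNAL";
		case 8:  return "Undefined_RIS_PART_SEL";
		case 9:  return "RIS_No_Control";
		case 10: return "Undefined_RIS_MON_SEL";
		case 11: return "RIS_No_Monitor";
		default: return "Reserved";
		}
	}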

[...]

(Review truncated -- those are the comments I had so far on the
previous series.)

Cheers
---Dave
Re: [PATCH v2 18/29] arm_mpam: Register and enable IRQs
Posted by James Morse 4 months ago
Hi Dave,

On 12/09/2025 16:22, Dave Martin wrote:
> On Wed, Sep 10, 2025 at 08:42:58PM +0000, James Morse wrote:
>> Register and enable error IRQs. All the MPAM error interrupts indicate a
>> software bug, e.g. out of range partid. If the error interrupt is ever
>> signalled, attempt to disable MPAM.
>>
>> Only the irq handler accesses the ESR register, so no locking is needed.
> 
> Nit: MPAMF_ESR?  (Casual readers may confuse it with ESR_ELx.

Sure, fixed.


> Formally, there is no MPAM "ESR" register, though people familiar with
> the spec will of course know what you're referring to.)
> 
>> The work to disable MPAM after an error needs to happen in process
>> context as it takes a mutex. It also unregisters the interrupts, meaning
>> it can't be done from the threaded part of a threaded interrupt.
>> Instead, mpam_disable() gets scheduled.
>>
>> Enabling the IRQs in the MSC may involve cross calling to a CPU that
>> can access the MSC.
>>
>> Once the IRQ is requested, the mpam_disable() path can be called
>> asynchronously, which will walk structures sized by max_partid. Ensure
>> this size is fixed before the interrupt is requested.

>> diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
>> index a9d3c4b09976..e7e4afc1ea95 100644
>> --- a/drivers/resctrl/mpam_devices.c
>> +++ b/drivers/resctrl/mpam_devices.c

>> @@ -166,6 +169,24 @@ static u64 mpam_msc_read_idr(struct mpam_msc *msc)
>>  	return (idr_high << 32) | idr_low;
>>  }
>>  
>> +static void mpam_msc_zero_esr(struct mpam_msc *msc)

> Nit: Maybe clear_esr?  (The fact that setting the ERRCODE and OVRWR
> fields to zero clears the interrupt and prepares for unambiguous
> reporting of the next error is more of an implementation detail.
> It doesn't matter what the rest of the register is set to.)

If you think that name is clearer - sure,


>> +{
>> +	__mpam_write_reg(msc, MPAMF_ESR, 0);
>> +	if (msc->has_extd_esr)
> 
> This deasserts the interrupt (if level-sensitive) and enables the MSC
> to report further errors.  If we are unlucky and an error occurs now,
> won't we splat the newly HW-generated RIS field by:
> 
>> +		__mpam_write_reg(msc, MPAMF_ESR + 4, 0);
> 
> ...?  If so, we will diagnose the wrong RIS when we pump the new error
> from MPAMF_ESR.  I think the correct interpretation of the spec may be
> that:
> 
>  a) software should treat fields in MPAMF_ESR[63:32] as valid only if
>     ERRCODE is nonzero, and
> 
>  b) software should never write to MPAMF_ESR[63:32] while ERRCODE is
>     zero.
> 
> Does this look right?  Should the fields be cleared in the opposite
> order?
> 
> Or alternatively, is it actually necessary to clear MPAMF_ESR[63:32]
> at all?
> 
> (The spec seems a bit vague on what software is supposed to do with
> this register to ensure correctness...)

Yeah - I don't think there was much intention here beyond making the RIS
available for the first error. As none of these errors can really be handled,
I don't think a second error value matters too much.

I've reordered it as you suggest, and added a block comment to explain the
problem:
-----%<-----
	u64 esr_low = __mpam_read_reg(msc, MPAMF_ESR);
	if (!esr_low)
		return;

	/*
	 * Clearing the high/low bits of MPAMF_ESR cannot be atomic.
	 * Clear the top half first, so that the pending error bits in the
	 * lower half prevent hardware from updating either half of the
	 * register.
	 */
	if (msc->has_extd_esr)
		__mpam_write_reg(msc, MPAMF_ESR + 4, 0);
	__mpam_write_reg(msc, MPAMF_ESR, 0);
-----%<-----

We should probably go bother the architects to find out if we can modify
the high bits like this safely.



>> +}

>> @@ -895,6 +920,13 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online)
>>  	}
>>  }
>>  
>> +static void _enable_percpu_irq(void *_irq)
>> +{
>> +	int *irq = _irq;
>> +
>> +	enable_percpu_irq(*irq, IRQ_TYPE_NONE);

> Can the type vary?  (Maybe this makes no sense on GIC-based systems --
> IRQ_TYPE_NONE (or "0") seems overwhelmingly common.)
> 
> (Just my lack of familiarity talking, here.)

PPIs can be edge or level - but the irqchip doesn't need to know at this point, and
specifying NONE tells it to use what it already knows. The irqchip was already told
what the firmware table said when the interrupt was registered. I believe the GIC knows
it's a PPI from the intid; they live in a magic range (INTIDs 16-31).


> [...]
> 
>> +static int __setup_ppi(struct mpam_msc *msc)
>> +{
>> +	int cpu;
>> +	struct device *dev = &msc->pdev->dev;
>> +
>> +	msc->error_dev_id = alloc_percpu(struct mpam_msc *);
>> +	if (!msc->error_dev_id)
>> +		return -ENOMEM;
>> +
>> +	for_each_cpu(cpu, &msc->accessibility) {
>> +		struct mpam_msc *empty = *per_cpu_ptr(msc->error_dev_id, cpu);
>> +
>> +		if (empty) {
>> +			dev_err_once(dev, "MSC shares PPI with %s!\n",
>> +				     dev_name(&empty->pdev->dev));
>> +			return -EBUSY;
>> +		}
>> +		*per_cpu_ptr(msc->error_dev_id, cpu) = msc;
>> +	}

> How are PPIs supposed to work?

You take the interrupt on the corresponding CPU, and the irqchip gives you the
corresponding percpu pointer. That is what is being set up here.
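
(That's what mpam_ppi_handler() above does with the percpu dev_id:

	struct mpam_msc *msc = *(struct mpam_msc **)dev_id;

the dereference reads this CPU's copy of the pointer that __setup_ppi()
filled in.)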


> An individual MSC that is affine to multiple CPUs has no way to
> distinguish which CPU an error relates to, and no CPU-specific (or even
> RIS-specific) ESR.

For deeper caches this is certainly a problem. But for things close in to the CPU, they
may well know which CPU this transaction came from. The MSC may even be part of the
CPU.


> So, won't such an interrupt pointlessly be taken on all CPUs, which
> would all fight over the reported event?

It ought to be only one, but nothing requires this.


> Have you encountered any platforms wired up this way?  The spec
> recommends not to do this, but does not provide any rationale...
> 
> The spec only mentions PPIs in the context of being affine to a single
> CPU (PE).

> It's not clear to me that any other use of PPIs makes
> sense (?)

The PPI problem also exists the other way round. If your MSC is not globally accessible,
you can't wire its interrupts up to an SPI, otherwise Linux can route the interrupt to
CPUs that can't access the MSC.

The only tool you have to get out of this is to use a PPI.

This came up multiple times with RAS when the caches signalled errors and it really needed
to go to the local CPUs, not a random CPU in a remote package.

It's my assumption that folk will build platforms the same shape, and reach for PPI again.


> If we really have to cope with this, maybe it would make sense to pick
> a single CPU in the affinity set (though we might have to move it
> around if the unlucky CPU is offlined).

Unfortunately PPIs are a totally separate set of wiring with a reserved intid range (or
two!) and their own registers in the GICR. It isn't possible to pretend they're a regular
interrupt taken on a particular CPU.

A choice that can be made here is to assume no-one will ever(?) use these, and drop
support. I don't know of a platform that is using PPI - but I can't say no-one is.


> 
> [...]
> 
>> +static char *mpam_errcode_names[16] = {
>> +	[0] = "No error",
>> +	[1] = "PARTID_SEL_Range",
>> +	[2] = "Req_PARTID_Range",
>> +	[3] = "MSMONCFG_ID_RANGE",
>> +	[4] = "Req_PMG_Range",
>> +	[5] = "Monitor_Range",
>> +	[6] = "intPARTID_Range",
>> +	[7] = "Unexpected_INTERNAL",
>> +	[8] = "Undefined_RIS_PART_SEL",
>> +	[9] = "RIS_No_Control",
>> +	[10] = "Undefined_RIS_MON_SEL",
>> +	[11] = "RIS_No_Monitor",
>> +	[12 ... 15] = "Reserved"
>> +};
>> +
>> +static int mpam_enable_msc_ecr(void *_msc)
>> +{
>> +	struct mpam_msc *msc = _msc;
>> +
>> +	__mpam_write_reg(msc, MPAMF_ECR, MPAMF_ECR_INTEN);
>> +
>> +	return 0;
>> +}
> 
> This could also be a switch () { case 0: return "foo";
> case 1: return "bar"; ... }, without the explicit table.  This would
> avoid having to think about the ERRCODE field growing.  (There are some
> RES0 bits looming over it.)
> 
> (This also tends to avoid the extra pointer table in .rodata, which
> might be of interest if this were a hot path.)

Given they added new fields to the end of the list, I'm not worried about it becoming sparse.


Thanks,

James
Re: [PATCH v2 18/29] arm_mpam: Register and enable IRQs
Posted by Ben Horgan 4 months, 3 weeks ago
Hi James,

On 9/10/25 21:42, James Morse wrote:
> Register and enable error IRQs. All the MPAM error interrupts indicate a
> software bug, e.g. out of range partid. If the error interrupt is ever
> signalled, attempt to disable MPAM.
> 
> Only the irq handler accesses the ESR register, so no locking is needed.
> The work to disable MPAM after an error needs to happen in process
> context as it takes a mutex. It also unregisters the interrupts, meaning
> it can't be done from the threaded part of a threaded interrupt.
> Instead, mpam_disable() gets scheduled.
> 
> Enabling the IRQs in the MSC may involve cross calling to a CPU that
> can access the MSC.
> 
> Once the IRQ is requested, the mpam_disable() path can be called
> asynchronously, which will walk structures sized by max_partid. Ensure
> this size is fixed before the interrupt is requested.
> 
> CC: Rohit Mathew <rohit.mathew@arm.com>
> Tested-by: Rohit Mathew <rohit.mathew@arm.com>
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> Changes since v1:
>  * Made mpam_unregister_irqs() safe to race with itself.
>  * Removed threaded interrupts.
>  * Schedule mpam_disable() from cpuhp callback in the case of an error.
>  * Added mpam_disable_reason.
>  * Use alloc_percpu()
> 
> Changes since RFC:
>  * Use guard macro when walking srcu list.
>  * Use INTEN macro for enabling interrupts.
>  * Move partid_max_published up earlier in mpam_enable_once().
> ---
>  drivers/resctrl/mpam_devices.c  | 277 +++++++++++++++++++++++++++++++-
>  drivers/resctrl/mpam_internal.h |  10 ++
>  2 files changed, 284 insertions(+), 3 deletions(-)
> 

>  
> +static int __setup_ppi(struct mpam_msc *msc)
> +{
> +	int cpu;
> +	struct device *dev = &msc->pdev->dev;
> +
> +	msc->error_dev_id = alloc_percpu(struct mpam_msc *);
> +	if (!msc->error_dev_id)
> +		return -ENOMEM;
> +
> +	for_each_cpu(cpu, &msc->accessibility) {
> +		struct mpam_msc *empty = *per_cpu_ptr(msc->error_dev_id, cpu);
> +
> +		if (empty) {

I'm confused about how this if condition can be satisfied. Isn't the
alloc clearing msc->error_dev_id for each cpu, and then it only gets
set for each cpu later in the iteration?

> +			dev_err_once(dev, "MSC shares PPI with %s!\n",
> +				     dev_name(&empty->pdev->dev));
> +			return -EBUSY;
> +		}
> +		*per_cpu_ptr(msc->error_dev_id, cpu) = msc;
> +	}
> +
> +	return 0;
> +}
> +
> +static int mpam_msc_setup_error_irq(struct mpam_msc *msc)
> +{
> +	int irq;
> +
> +	irq = platform_get_irq_byname_optional(msc->pdev, "error");
> +	if (irq <= 0)
> +		return 0;
> +
> +	/* Allocate and initialise the percpu device pointer for PPI */
> +	if (irq_is_percpu(irq))
> +		return __setup_ppi(msc);
> +
> +	/* sanity check: shared interrupts can be routed anywhere? */
> +	if (!cpumask_equal(&msc->accessibility, cpu_possible_mask)) {
> +		pr_err_once("msc:%u is a private resource with a shared error interrupt",
> +			    msc->id);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * An MSC can control traffic from a set of CPUs, but may only be accessible
>   * from a (hopefully wider) set of CPUs. The common reason for this is power
> @@ -1060,6 +1143,10 @@ static int mpam_msc_drv_probe(struct platform_device *pdev)
>  			break;
>  		}
>  
> +		err = mpam_msc_setup_error_irq(msc);
> +		if (err)
> +			break;
> +
>  		if (device_property_read_u32(&pdev->dev, "pcc-channel",
>  					     &msc->pcc_subspace_id))
>  			msc->iface = MPAM_IFACE_MMIO;
> @@ -1318,11 +1405,172 @@ static void mpam_enable_merge_features(struct list_head *all_classes_list)
>  	}
>  }
>  
> +static char *mpam_errcode_names[16] = {
> +	[0] = "No error",
> +	[1] = "PARTID_SEL_Range",
> +	[2] = "Req_PARTID_Range",
> +	[3] = "MSMONCFG_ID_RANGE",
> +	[4] = "Req_PMG_Range",
> +	[5] = "Monitor_Range",
> +	[6] = "intPARTID_Range",
> +	[7] = "Unexpected_INTERNAL",
> +	[8] = "Undefined_RIS_PART_SEL",
> +	[9] = "RIS_No_Control",
> +	[10] = "Undefined_RIS_MON_SEL",
> +	[11] = "RIS_No_Monitor",
> +	[12 ... 15] = "Reserved"
> +};
> +
> +static int mpam_enable_msc_ecr(void *_msc)
> +{
> +	struct mpam_msc *msc = _msc;
> +
> +	__mpam_write_reg(msc, MPAMF_ECR, MPAMF_ECR_INTEN);
> +
> +	return 0;
> +}
> +
> +/* This can run in mpam_disable(), and the interrupt handler on the same CPU */
> +static int mpam_disable_msc_ecr(void *_msc)
> +{
> +	struct mpam_msc *msc = _msc;
> +
> +	__mpam_write_reg(msc, MPAMF_ECR, 0);
> +
> +	return 0;
> +}
> +
> +static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc)
> +{
> +	u64 reg;
> +	u16 partid;
> +	u8 errcode, pmg, ris;
> +
> +	if (WARN_ON_ONCE(!msc) ||
> +	    WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(),
> +					   &msc->accessibility)))
> +		return IRQ_NONE;
> +
> +	reg = mpam_msc_read_esr(msc);
> +
> +	errcode = FIELD_GET(MPAMF_ESR_ERRCODE, reg);
> +	if (!errcode)
> +		return IRQ_NONE;
> +
> +	/* Clear level triggered irq */
> +	mpam_msc_zero_esr(msc);
> +
> +	partid = FIELD_GET(MPAMF_ESR_PARTID_MON, reg);
> +	pmg = FIELD_GET(MPAMF_ESR_PMG, reg);
> +	ris = FIELD_GET(MPAMF_ESR_RIS, reg);
> +
> +	pr_err_ratelimited("error irq from msc:%u '%s', partid:%u, pmg: %u, ris: %u\n",
> +			   msc->id, mpam_errcode_names[errcode], partid, pmg,
> +			   ris);
> +
> +	/* Disable this interrupt. */
> +	mpam_disable_msc_ecr(msc);
> +
> +	/*
> +	 * Schedule the teardown work. Don't use a threaded IRQ as we can't
> +	 * unregister the interrupt from the threaded part of the handler.
> +	 */
> +	mpam_disable_reason = "hardware error interrupt";
> +	schedule_work(&mpam_broken_work);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static irqreturn_t mpam_ppi_handler(int irq, void *dev_id)
> +{
> +	struct mpam_msc *msc = *(struct mpam_msc **)dev_id;
> +
> +	return __mpam_irq_handler(irq, msc);
> +}
> +
> +static irqreturn_t mpam_spi_handler(int irq, void *dev_id)
> +{
> +	struct mpam_msc *msc = dev_id;
> +
> +	return __mpam_irq_handler(irq, msc);
> +}
> +
> +static int mpam_register_irqs(void)
> +{
> +	int err, irq;
> +	struct mpam_msc *msc;
> +
> +	lockdep_assert_cpus_held();
> +
> +	guard(srcu)(&mpam_srcu);
> +	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
> +				 srcu_read_lock_held(&mpam_srcu)) {
> +		irq = platform_get_irq_byname_optional(msc->pdev, "error");
> +		if (irq <= 0)
> +			continue;
> +
> +		/* The MPAM spec says the interrupt can be SPI, PPI or LPI */
> +		/* We anticipate sharing the interrupt with other MSCs */
> +		if (irq_is_percpu(irq)) {
> +			err = request_percpu_irq(irq, &mpam_ppi_handler,
> +						 "mpam:msc:error",
> +						 msc->error_dev_id);
> +			if (err)
> +				return err;
> +
> +			msc->reenable_error_ppi = irq;
> +			smp_call_function_many(&msc->accessibility,
> +					       &_enable_percpu_irq, &irq,
> +					       true);
> +		} else {
> +			err = devm_request_irq(&msc->pdev->dev, irq,
> +					       &mpam_spi_handler, IRQF_SHARED,
> +					       "mpam:msc:error", msc);
> +			if (err)
> +				return err;
> +		}
> +
> +		set_bit(MPAM_ERROR_IRQ_REQUESTED, &msc->error_irq_flags);
> +		mpam_touch_msc(msc, mpam_enable_msc_ecr, msc);
> +		set_bit(MPAM_ERROR_IRQ_HW_ENABLED, &msc->error_irq_flags);
> +	}
> +
> +	return 0;
> +}
> +
> +static void mpam_unregister_irqs(void)
> +{
> +	int irq, idx;
> +	struct mpam_msc *msc;
> +
> +	cpus_read_lock();
> +	/* take the lock as free_irq() can sleep */
> +	idx = srcu_read_lock(&mpam_srcu);
> +	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
> +				 srcu_read_lock_held(&mpam_srcu)) {
> +		irq = platform_get_irq_byname_optional(msc->pdev, "error");
> +		if (irq <= 0)
> +			continue;
> +
> +		if (test_and_clear_bit(MPAM_ERROR_IRQ_HW_ENABLED, &msc->error_irq_flags))
> +			mpam_touch_msc(msc, mpam_disable_msc_ecr, msc);
> +
> +		if (test_and_clear_bit(MPAM_ERROR_IRQ_REQUESTED, &msc->error_irq_flags)) {
> +			if (irq_is_percpu(irq)) {
> +				msc->reenable_error_ppi = 0;
> +				free_percpu_irq(irq, msc->error_dev_id);
> +			} else {
> +				devm_free_irq(&msc->pdev->dev, irq, msc);
> +			}
> +		}
> +	}
> +	srcu_read_unlock(&mpam_srcu, idx);
> +	cpus_read_unlock();
> +}
> +
>  static void mpam_enable_once(void)
>  {
> -	mutex_lock(&mpam_list_lock);
> -	mpam_enable_merge_features(&mpam_classes);
> -	mutex_unlock(&mpam_list_lock);
> +	int err;
>  
>  	/*
>  	 * Once the cpuhp callbacks have been changed, mpam_partid_max can no
> @@ -1332,6 +1580,27 @@ static void mpam_enable_once(void)
>  	partid_max_published = true;
>  	spin_unlock(&partid_max_lock);
>  
> +	/*
> +	 * If all the MSC have been probed, enabling the IRQs happens next.
> +	 * That involves cross-calling to a CPU that can reach the MSC, and
> +	 * the locks must be taken in this order:
> +	 */
> +	cpus_read_lock();
> +	mutex_lock(&mpam_list_lock);
> +	mpam_enable_merge_features(&mpam_classes);
> +
> +	err = mpam_register_irqs();
> +	if (err)
> +		pr_warn("Failed to register irqs: %d\n", err);
> +
> +	mutex_unlock(&mpam_list_lock);
> +	cpus_read_unlock();
> +
> +	if (err) {
> +		schedule_work(&mpam_broken_work);
> +		return;
> +	}
> +
>  	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline);
>  
>  	printk(KERN_INFO "MPAM enabled with %u PARTIDs and %u PMGs\n",
> @@ -1397,6 +1666,8 @@ void mpam_disable(struct work_struct *ignored)
>  	}
>  	mutex_unlock(&mpam_cpuhp_state_lock);
>  
> +	mpam_unregister_irqs();
> +
>  	idx = srcu_read_lock(&mpam_srcu);
>  	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
>  				 srcu_read_lock_held(&mpam_srcu))
> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> index 6e047fbd3512..f04a9ef189cf 100644
> --- a/drivers/resctrl/mpam_internal.h
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -32,6 +32,10 @@ struct mpam_garbage {
>  	struct platform_device	*pdev;
>  };
>  
> +/* Bit positions for error_irq_flags */
> +#define	MPAM_ERROR_IRQ_REQUESTED  0
> +#define	MPAM_ERROR_IRQ_HW_ENABLED 1
> +
>  struct mpam_msc {
>  	/* member of mpam_all_msc */
>  	struct list_head        all_msc_list;
> @@ -46,6 +50,11 @@ struct mpam_msc {
>  	struct pcc_mbox_chan	*pcc_chan;
>  	u32			nrdy_usec;
>  	cpumask_t		accessibility;
> +	bool			has_extd_esr;
> +
> +	int				reenable_error_ppi;
> +	struct mpam_msc * __percpu	*error_dev_id;
> +
>  	atomic_t		online_refs;
>  
>  	/*
> @@ -54,6 +63,7 @@ struct mpam_msc {
>  	 */
>  	struct mutex		probe_lock;
>  	bool			probed;
> +	unsigned long		error_irq_flags;
>  	u16			partid_max;
>  	u8			pmg_max;
>  	unsigned long		ris_idxs;


Thanks,

Ben
Re: [PATCH v2 18/29] arm_mpam: Register and enable IRQs
Posted by James Morse 4 months, 1 week ago
Hi Ben,

On 12/09/2025 15:40, Ben Horgan wrote:
> On 9/10/25 21:42, James Morse wrote:
>> Register and enable error IRQs. All the MPAM error interrupts indicate a
>> software bug, e.g. out of range partid. If the error interrupt is ever
>> signalled, attempt to disable MPAM.
>>
>> Only the irq handler accesses the ESR register, so no locking is needed.
>> The work to disable MPAM after an error needs to happen in process
>> context as it takes a mutex. It also unregisters the interrupts, meaning
>> it can't be done from the threaded part of a threaded interrupt.
>> Instead, mpam_disable() gets scheduled.
>>
>> Enabling the IRQs in the MSC may involve cross calling to a CPU that
>> can access the MSC.
>>
>> Once the IRQ is requested, the mpam_disable() path can be called
>> asynchronously, which will walk structures sized by max_partid. Ensure
>> this size is fixed before the interrupt is requested.


>> +static int __setup_ppi(struct mpam_msc *msc)
>> +{
>> +	int cpu;
>> +	struct device *dev = &msc->pdev->dev;
>> +
>> +	msc->error_dev_id = alloc_percpu(struct mpam_msc *);
>> +	if (!msc->error_dev_id)
>> +		return -ENOMEM;
>> +
>> +	for_each_cpu(cpu, &msc->accessibility) {
>> +		struct mpam_msc *empty = *per_cpu_ptr(msc->error_dev_id, cpu);
>> +
>> +		if (empty) {
> 
> I'm confused about how this if condition can be satisfied. Isn't the
> alloc clearing msc->error_dev_id for each cpu, and then it only gets
> set for each cpu later in the iteration?

Yes, you're right.

I think this was part of the support for PPI partitions, where multiple partitions would
get set up here. This was a sanity check that they didn't overlap...


I've ripped that out.


>> +			dev_err_once(dev, "MSC shares PPI with %s!\n",
>> +				     dev_name(&empty->pdev->dev));
>> +			return -EBUSY;
>> +		}
>> +		*per_cpu_ptr(msc->error_dev_id, cpu) = msc;
>> +	}
>> +
>> +	return 0;
>> +}

Thanks,

James
Re: [PATCH v2 18/29] arm_mpam: Register and enable IRQs
Posted by Jonathan Cameron 4 months, 3 weeks ago
On Wed, 10 Sep 2025 20:42:58 +0000
James Morse <james.morse@arm.com> wrote:

> Register and enable error IRQs. All the MPAM error interrupts indicate a
> software bug, e.g. out of range partid. If the error interrupt is ever
> signalled, attempt to disable MPAM.
> 
> Only the irq handler accesses the ESR register, so no locking is needed.
> The work to disable MPAM after an error needs to happen in process
> context as it takes a mutex. It also unregisters the interrupts, meaning
> it can't be done from the threaded part of a threaded interrupt.
> Instead, mpam_disable() gets scheduled.
> 
> Enabling the IRQs in the MSC may involve cross calling to a CPU that
> can access the MSC.
> 
> Once the IRQ is requested, the mpam_disable() path can be called
> asynchronously, which will walk structures sized by max_partid. Ensure
> this size is fixed before the interrupt is requested.
> 
> CC: Rohit Mathew <rohit.mathew@arm.com>
> Tested-by: Rohit Mathew <rohit.mathew@arm.com>
> Signed-off-by: James Morse <james.morse@arm.com>
A few comments inline.


> @@ -1318,11 +1405,172 @@ static void mpam_enable_merge_features(struct list_head *all_classes_list)
>  	}
>  }
>  
> +static char *mpam_errcode_names[16] = {
> +	[0] = "No error",

I think you had a bunch of defines for these in an earlier patch.  Can we use
those to index here instead of [0] etc.?
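
i.e. something like (guessing the MPAM_ERRCODE_* names from that patch):

	static char *mpam_errcode_names[16] = {
		[MPAM_ERRCODE_NONE]			= "No error",
		[MPAM_ERRCODE_PARTID_SEL_RANGE]		= "PARTID_SEL_Range",
		...
	};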

> +	[1] = "PARTID_SEL_Range",
> +	[2] = "Req_PARTID_Range",
> +	[3] = "MSMONCFG_ID_RANGE",
> +	[4] = "Req_PMG_Range",
> +	[5] = "Monitor_Range",
> +	[6] = "intPARTID_Range",
> +	[7] = "Unexpected_INTERNAL",
> +	[8] = "Undefined_RIS_PART_SEL",
> +	[9] = "RIS_No_Control",
> +	[10] = "Undefined_RIS_MON_SEL",
> +	[11] = "RIS_No_Monitor",
> +	[12 ... 15] = "Reserved"
> +};


> +static void mpam_unregister_irqs(void)
> +{
> +	int irq, idx;
> +	struct mpam_msc *msc;
> +
> +	cpus_read_lock();

	guard(cpus_read_lock)();
	guard(srcu)(&mpam_srcu);

> +	/* take the lock as free_irq() can sleep */
> +	idx = srcu_read_lock(&mpam_srcu);
> +	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
> +				 srcu_read_lock_held(&mpam_srcu)) {
> +		irq = platform_get_irq_byname_optional(msc->pdev, "error");
> +		if (irq <= 0)
> +			continue;
> +
> +		if (test_and_clear_bit(MPAM_ERROR_IRQ_HW_ENABLED, &msc->error_irq_flags))
> +			mpam_touch_msc(msc, mpam_disable_msc_ecr, msc);
> +
> +		if (test_and_clear_bit(MPAM_ERROR_IRQ_REQUESTED, &msc->error_irq_flags)) {
> +			if (irq_is_percpu(irq)) {
> +				msc->reenable_error_ppi = 0;
> +				free_percpu_irq(irq, msc->error_dev_id);
> +			} else {
> +				devm_free_irq(&msc->pdev->dev, irq, msc);
> +			}
> +		}
> +	}
> +	srcu_read_unlock(&mpam_srcu, idx);
> +	cpus_read_unlock();
> +}
> +
>  static void mpam_enable_once(void)
>  {
> -	mutex_lock(&mpam_list_lock);
> -	mpam_enable_merge_features(&mpam_classes);
> -	mutex_unlock(&mpam_list_lock);
> +	int err;
>  
>  	/*
>  	 * Once the cpuhp callbacks have been changed, mpam_partid_max can no
> @@ -1332,6 +1580,27 @@ static void mpam_enable_once(void)
>  	partid_max_published = true;
>  	spin_unlock(&partid_max_lock);
>  
> +	/*
> +	 * If all the MSC have been probed, enabling the IRQs happens next.
> +	 * That involves cross-calling to a CPU that can reach the MSC, and
> +	 * the locks must be taken in this order:
> +	 */
> +	cpus_read_lock();
> +	mutex_lock(&mpam_list_lock);
> +	mpam_enable_merge_features(&mpam_classes);
> +
> +	err = mpam_register_irqs();
> +	if (err)
> +		pr_warn("Failed to register irqs: %d\n", err);

Perhaps move the print into the if (err) below?

> +
> +	mutex_unlock(&mpam_list_lock);
> +	cpus_read_unlock();
> +
> +	if (err) {
> +		schedule_work(&mpam_broken_work);
> +		return;
> +	}

> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
> index 6e047fbd3512..f04a9ef189cf 100644
> --- a/drivers/resctrl/mpam_internal.h
> +++ b/drivers/resctrl/mpam_internal.h
> @@ -32,6 +32,10 @@ struct mpam_garbage {
>  	struct platform_device	*pdev;
>  };
>  
> +/* Bit positions for error_irq_flags */
> +#define	MPAM_ERROR_IRQ_REQUESTED  0
> +#define	MPAM_ERROR_IRQ_HW_ENABLED 1

If there aren't going to be loads more of these (I've not really thought
about whether there might) then using a bitmap for these seems to add complexity
that we wouldn't see with:
bool error_irq_req;
bool error_irq_hw_enabled;


> +
>  struct mpam_msc {
>  	/* member of mpam_all_msc */
>  	struct list_head        all_msc_list;
> @@ -46,6 +50,11 @@ struct mpam_msc {
>  	struct pcc_mbox_chan	*pcc_chan;
>  	u32			nrdy_usec;
>  	cpumask_t		accessibility;
> +	bool			has_extd_esr;
> +
> +	int				reenable_error_ppi;
> +	struct mpam_msc * __percpu	*error_dev_id;
> +
>  	atomic_t		online_refs;
>  
>  	/*
> @@ -54,6 +63,7 @@ struct mpam_msc {
>  	 */
>  	struct mutex		probe_lock;
>  	bool			probed;
> +	unsigned long		error_irq_flags;
>  	u16			partid_max;
>  	u8			pmg_max;
>  	unsigned long		ris_idxs;
Re: [PATCH v2 18/29] arm_mpam: Register and enable IRQs
Posted by James Morse 4 months, 1 week ago
Hi Jonathan,

On 12/09/2025 13:12, Jonathan Cameron wrote:
> On Wed, 10 Sep 2025 20:42:58 +0000
> James Morse <james.morse@arm.com> wrote:
> 
>> Register and enable error IRQs. All the MPAM error interrupts indicate a
>> software bug, e.g. out of range partid. If the error interrupt is ever
>> signalled, attempt to disable MPAM.
>>
>> Only the irq handler accesses the ESR register, so no locking is needed.
>> The work to disable MPAM after an error needs to happen in process
>> context as it takes a mutex. It also unregisters the interrupts, meaning
>> it can't be done from the threaded part of a threaded interrupt.
>> Instead, mpam_disable() gets scheduled.
>>
>> Enabling the IRQs in the MSC may involve cross calling to a CPU that
>> can access the MSC.
>>
>> Once the IRQ is requested, the mpam_disable() path can be called
>> asynchronously, which will walk structures sized by max_partid. Ensure
>> this size is fixed before the interrupt is requested.

>> @@ -1318,11 +1405,172 @@ static void mpam_enable_merge_features(struct list_head *all_classes_list)
>>  	}
>>  }
>>  
>> +static char *mpam_errcode_names[16] = {
>> +	[0] = "No error",
> 
> I think you had a bunch of defines for these in an earlier patch.  Can we use
> those to index here instead of [0] etc.?

Sure,


>> +	[1] = "PARTID_SEL_Range",
>> +	[2] = "Req_PARTID_Range",
>> +	[3] = "MSMONCFG_ID_RANGE",
>> +	[4] = "Req_PMG_Range",
>> +	[5] = "Monitor_Range",
>> +	[6] = "intPARTID_Range",
>> +	[7] = "Unexpected_INTERNAL",
>> +	[8] = "Undefined_RIS_PART_SEL",
>> +	[9] = "RIS_No_Control",
>> +	[10] = "Undefined_RIS_MON_SEL",
>> +	[11] = "RIS_No_Monitor",
>> +	[12 ... 15] = "Reserved"
>> +};
> 
> 
>> +static void mpam_unregister_irqs(void)
>> +{
>> +	int irq, idx;
>> +	struct mpam_msc *msc;
>> +
>> +	cpus_read_lock();
> 
> 	guard(cpus_read_lock)();
> 	guard(srcu)(&mpam_srcu);

Sure, looks like I didn't realise there was a cpus_read_lock version of this when I went
looking for places to add this.


>> +	/* take the lock as free_irq() can sleep */

The comment gets dropped as this mattered for an earlier locking scheme. (but free_irq()
can still sleep)


>> +	idx = srcu_read_lock(&mpam_srcu);
>> +	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
>> +				 srcu_read_lock_held(&mpam_srcu)) {
>> +		irq = platform_get_irq_byname_optional(msc->pdev, "error");
>> +		if (irq <= 0)
>> +			continue;
>> +
>> +		if (test_and_clear_bit(MPAM_ERROR_IRQ_HW_ENABLED, &msc->error_irq_flags))
>> +			mpam_touch_msc(msc, mpam_disable_msc_ecr, msc);
>> +
>> +		if (test_and_clear_bit(MPAM_ERROR_IRQ_REQUESTED, &msc->error_irq_flags)) {
>> +			if (irq_is_percpu(irq)) {
>> +				msc->reenable_error_ppi = 0;
>> +				free_percpu_irq(irq, msc->error_dev_id);
>> +			} else {
>> +				devm_free_irq(&msc->pdev->dev, irq, msc);
>> +			}
>> +		}
>> +	}
>> +	srcu_read_unlock(&mpam_srcu, idx);
>> +	cpus_read_unlock();
>> +}

>> @@ -1332,6 +1580,27 @@ static void mpam_enable_once(void)
>>  	partid_max_published = true;
>>  	spin_unlock(&partid_max_lock);
>>  
>> +	/*
>> +	 * If all the MSC have been probed, enabling the IRQs happens next.
>> +	 * That involves cross-calling to a CPU that can reach the MSC, and
>> +	 * the locks must be taken in this order:
>> +	 */
>> +	cpus_read_lock();
>> +	mutex_lock(&mpam_list_lock);
>> +	mpam_enable_merge_features(&mpam_classes);
>> +
>> +	err = mpam_register_irqs();
>> +	if (err)
>> +		pr_warn("Failed to register irqs: %d\n", err);
> 
> Perhaps move the print into the if (err) below?

More types of error get added later, and it's maybe useful to know which of these failed.


>> diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
>> index 6e047fbd3512..f04a9ef189cf 100644
>> --- a/drivers/resctrl/mpam_internal.h
>> +++ b/drivers/resctrl/mpam_internal.h
>> @@ -32,6 +32,10 @@ struct mpam_garbage {
>>  	struct platform_device	*pdev;
>>  };
>>  
>> +/* Bit positions for error_irq_flags */
>> +#define	MPAM_ERROR_IRQ_REQUESTED  0
>> +#define	MPAM_ERROR_IRQ_HW_ENABLED 1
> 
> If there aren't going to be loads more of these (I've not really thought
> about whether there might) then using a bitmap for these seems to add complexity
> that we wouldn't see with:
> bool error_irq_req;
> bool error_irq_hw_enabled;

It's a bitmap so that mpam_unregister_irqs() can use test_and_clear_bit() on them,
because with a real interrupt mpam_unregister_irqs() can run multiple times in parallel
with itself.
Doing this as bools would mean having a mutex to prevent that from happening.
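
(i.e. with the bitmap, two racing callers can both safely do

	if (test_and_clear_bit(MPAM_ERROR_IRQ_REQUESTED, &msc->error_irq_flags))
		devm_free_irq(&msc->pdev->dev, irq, msc);

and only the caller that saw the bit set performs the free.)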

I'll do that as it's slightly simpler.


Thanks,

James