[PATCH 3/4] genirq/msi: Move prepare() call to per-device allocation

Marc Zyngier posted 4 patches 7 months, 1 week ago
There is a newer version of this series
[PATCH 3/4] genirq/msi: Move prepare() call to per-device allocation
Posted by Marc Zyngier 7 months, 1 week ago
The current device MSI infrastructure is subtly broken, as it
will issue an .msi_prepare() callback into the MSI controller
driver every time it needs to allocate an MSI. That's pretty wrong,
as the contract between the MSI controller and the core code is that
.msi_prepare() is called exactly once per device.

This leads to some subtle breakage in said MSI controller drivers,
as it gives the impression that there are multiple endpoints sharing
a bus identifier (RID in PCI parlance, DID for GICv3+). It implies
that whatever allocation the ITS driver (for example) has done on
behalf of these devices cannot be undone, as there is no way to
track the shared state. This is particularly bad for wire-MSI devices,
for which .msi_prepare() is called for. each. input. line.

To address this issue, move the call to .msi_prepare() to take place
at the point of irq domain allocation, which is the only place that
makes sense. The msi_alloc_info_t structure is made part of the
msi_domain_template, so that its life-cycle is that of the domain
as well.

Finally, the msi_info::alloc_data field is made to point at this
allocation tracking structure, ensuring that it is carried around
the block.

This is all pretty straightforward, except for the non-device-MSI
leftovers, which still have to call .msi_prepare() at the old
spot. One day...

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/msi.h |  2 ++
 kernel/irq/msi.c    | 39 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 0a44a2cba3105..68a8b2d03eba9 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -513,12 +513,14 @@ struct msi_domain_info {
  * @chip:	Interrupt chip for this domain
  * @ops:	MSI domain ops
  * @info:	MSI domain info data
+ * @arg:	MSI domain allocation data (arch specific)
  */
 struct msi_domain_template {
 	char			name[48];
 	struct irq_chip		chip;
 	struct msi_domain_ops	ops;
 	struct msi_domain_info	info;
+	msi_alloc_info_t	arg;
 };
 
 /*
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index a65ccf19b15d9..b8dc3289958c6 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -59,7 +59,8 @@ struct msi_ctrl {
 static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
 static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
 static inline int msi_sysfs_create_group(struct device *dev);
-
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+				   int nvec, msi_alloc_info_t *arg);
 
 /**
  * msi_alloc_desc - Allocate an initialized msi_desc
@@ -1025,6 +1026,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
 	bundle->info.ops = &bundle->ops;
 	bundle->info.data = domain_data;
 	bundle->info.chip_data = chip_data;
+	bundle->info.alloc_data = &bundle->arg;
 
 	pops = parent->msi_parent_ops;
 	snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
@@ -1053,21 +1055,28 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
 	msi_lock_descs(dev);
 
 	if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
-		goto fail;
+		goto fail_unlock;
 
 	if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
-		goto fail;
+		goto fail_unlock;
 
 	domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
 	if (!domain)
-		goto fail;
+		goto fail_unlock;
 
 	domain->dev = dev;
 	dev->msi.data->__domains[domid].domain = domain;
+
+	if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->arg))
+		goto fail;
+
 	msi_unlock_descs(dev);
 	return true;
 
 fail:
+	dev->msi.data->__domains[domid].domain = NULL;
+	irq_domain_remove(domain);
+fail_unlock:
 	msi_unlock_descs(dev);
 free_fwnode:
 	irq_domain_free_fwnode(fwnalloced);
@@ -1250,6 +1259,26 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
 	return 0;
 }
 
+static int __populate_alloc_info(struct irq_domain *domain, struct device *dev,
+				 unsigned int nirqs, msi_alloc_info_t *arg)
+{
+	struct msi_domain_info *info = domain->host_data;
+	int ret = 0;
+
+	/*
+	 * If the caller has provided a template alloc info, use that. Once
+	 * all users of msi_create_irq_domain() have been eliminated, this
+	 * should be the only source of allocation information, and the
+	 * prepare call below should be finally removed.
+	 */
+	if (info->alloc_data)
+		*arg = *info->alloc_data;
+	else
+		ret = msi_domain_prepare_irqs(domain, dev, nirqs, arg);
+
+	return ret;
+}
+
 static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
 				   struct msi_ctrl *ctrl)
 {
@@ -1262,7 +1291,7 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
 	unsigned long idx;
 	int i, ret, virq;
 
-	ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
+	ret = __populate_alloc_info(domain, dev, ctrl->nirqs, &arg);
 	if (ret)
 		return ret;
 
-- 
2.39.2
Re: [PATCH 3/4] genirq/msi: Move prepare() call to per-device allocation
Posted by Thomas Gleixner 7 months, 1 week ago
On Sun, May 11 2025 at 17:35, Marc Zyngier wrote:
> The current device MSI infrastructure is subtly broken, as it
> will issue an .msi_prepare() callback into the MSI controller
> driver every time it needs to allocate an MSI. That's pretty wrong,
> as the contract between the MSI controller and the core code is that
> .msi_prepare() is called exactly once per device.

That contract is nowhere written in stone.

There are some MSI controller which get confused about that, but that's
a problem of said controllers

> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index 0a44a2cba3105..68a8b2d03eba9 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -513,12 +513,14 @@ struct msi_domain_info {
>   * @chip:	Interrupt chip for this domain
>   * @ops:	MSI domain ops
>   * @info:	MSI domain info data
> + * @arg:	MSI domain allocation data (arch specific)

arg is a horrible name. Can this please be alloc_info or such?

> @@ -1025,6 +1026,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
>  	bundle->info.ops = &bundle->ops;
>  	bundle->info.data = domain_data;
>  	bundle->info.chip_data = chip_data;
> +	bundle->info.alloc_data = &bundle->arg;
>  
>  	pops = parent->msi_parent_ops;
>  	snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
> @@ -1053,21 +1055,28 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
>  	msi_lock_descs(dev);

Please work against tip irq/msi which carries the guard() replacement
for msi_lock_descs(). This patch heavily conflicts with the queued
changes.

> +static int __populate_alloc_info(struct irq_domain *domain, struct device *dev,
> +				 unsigned int nirqs, msi_alloc_info_t *arg)
> +{

Why does this need double underscores?

> +	struct msi_domain_info *info = domain->host_data;
> +	int ret = 0;
> +
> +	/*
> +	 * If the caller has provided a template alloc info, use that. Once
> +	 * all users of msi_create_irq_domain() have been eliminated, this
> +	 * should be the only source of allocation information, and the
> +	 * prepare call below should be finally removed.

That's only a matter of decades :)

> +	 */
> +	if (info->alloc_data)
> +		*arg = *info->alloc_data;
> +	else
> +		ret = msi_domain_prepare_irqs(domain, dev, nirqs, arg);
> +
> +	return ret;

	if (!info->alloc_data)
        	return msi_domain_prepare_irqs(domain, dev, nirqs, arg);

	*arg = *info->alloc_data;
        return 0;

perhaps?

Thanks,

        tglx
Re: [PATCH 3/4] genirq/msi: Move prepare() call to per-device allocation
Posted by Marc Zyngier 7 months, 1 week ago
On Mon, 12 May 2025 15:24:39 +0100,
Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> On Sun, May 11 2025 at 17:35, Marc Zyngier wrote:
> > The current device MSI infrastructure is subtly broken, as it
> > will issue an .msi_prepare() callback into the MSI controller
> > driver every time it needs to allocate an MSI. That's pretty wrong,
> > as the contract between the MSI controller and the core code is that
> > .msi_prepare() is called exactly once per device.
> 
> That contract is nowhere written in stone.

It was *definitely* there the first place, and a baked in assumption
since the ITS code was merged. You're welcome to come up with a new
scheme, but the way the HW works requires this prepare phase to take
place once per device.

If we can't have that, maybe we should consider reverting the GICv3/v4
code back to the pre-6.10 scheme that doesn't suffer from this issue.

> There are some MSI controller which get confused about that, but that's
> a problem of said controllers

No. It's an infrastructure problem. This model worked before for a
whole class of HW, until it was mutated into something else.

> 
> > diff --git a/include/linux/msi.h b/include/linux/msi.h
> > index 0a44a2cba3105..68a8b2d03eba9 100644
> > --- a/include/linux/msi.h
> > +++ b/include/linux/msi.h
> > @@ -513,12 +513,14 @@ struct msi_domain_info {
> >   * @chip:	Interrupt chip for this domain
> >   * @ops:	MSI domain ops
> >   * @info:	MSI domain info data
> > + * @arg:	MSI domain allocation data (arch specific)
> 
> arg is a horrible name. Can this please be alloc_info or such?

Because that's the name every single function that takes it as a
parameter uses? But sure, whatever name you want.

> 
> > @@ -1025,6 +1026,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
> >  	bundle->info.ops = &bundle->ops;
> >  	bundle->info.data = domain_data;
> >  	bundle->info.chip_data = chip_data;
> > +	bundle->info.alloc_data = &bundle->arg;
> >  
> >  	pops = parent->msi_parent_ops;
> >  	snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
> > @@ -1053,21 +1055,28 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
> >  	msi_lock_descs(dev);
> 
> Please work against tip irq/msi which carries the guard() replacement
> for msi_lock_descs(). This patch heavily conflicts with the queued
> changes.
> 
> > +static int __populate_alloc_info(struct irq_domain *domain, struct device *dev,
> > +				 unsigned int nirqs, msi_alloc_info_t *arg)
> > +{
> 
> Why does this need double underscores?

Because it doesn't look that out of place in this file?

> 
> > +	struct msi_domain_info *info = domain->host_data;
> > +	int ret = 0;
> > +
> > +	/*
> > +	 * If the caller has provided a template alloc info, use that. Once
> > +	 * all users of msi_create_irq_domain() have been eliminated, this
> > +	 * should be the only source of allocation information, and the
> > +	 * prepare call below should be finally removed.
> 
> That's only a matter of decades :)
>
> > +	 */
> > +	if (info->alloc_data)
> > +		*arg = *info->alloc_data;
> > +	else
> > +		ret = msi_domain_prepare_irqs(domain, dev, nirqs, arg);
> > +
> > +	return ret;
> 
> 	if (!info->alloc_data)
>         	return msi_domain_prepare_irqs(domain, dev, nirqs, arg);
> 
> 	*arg = *info->alloc_data;
>         return 0;
> 
> perhaps?

Sure.

	M.

-- 
Without deviation from the norm, progress is not possible.