The current device MSI infrastructure is subtly broken, as it
will issue an .msi_prepare() callback into the MSI controller
driver every time it needs to allocate an MSI. That's pretty wrong,
as the contract (or unwarranted assumption, depending who you ask)
between the MSI controller and the core code is that .msi_prepare()
is called exactly once per device.
This leads to some subtle breakage in said MSI controller drivers,
as it gives the impression that there are multiple endpoints sharing
a bus identifier (RID in PCI parlance, DID for GICv3+). It implies
that whatever allocation the ITS driver (for example) has done on
behalf of these devices cannot be undone, as there is no way to
track the shared state. This is particularly bad for wire-MSI devices,
for which .msi_prepare() is called for. each. input. line.
To address this issue, move the call to .msi_prepare() to take place
at the point of irq domain allocation, which is the only place that
makes sense. The msi_alloc_info_t structure is made part of the
msi_domain_template, so that its life-cycle is that of the domain
as well.
Finally, the msi_info::alloc_data field is made to point at this
allocation tracking structure, ensuring that it is carried around
the block.
This is all pretty straightforward, except for the non-device-MSI
leftovers, which still have to call .msi_prepare() at the old
spot. One day...
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
include/linux/msi.h | 2 ++
kernel/irq/msi.c | 35 +++++++++++++++++++++++++++++++----
2 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 63c23003ec9b7..ba1c77a829a1c 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -516,12 +516,14 @@ struct msi_domain_info {
* @chip: Interrupt chip for this domain
* @ops: MSI domain ops
* @info: MSI domain info data
+ * @alloc_info: MSI domain allocation data (arch specific)
*/
struct msi_domain_template {
char name[48];
struct irq_chip chip;
struct msi_domain_ops ops;
struct msi_domain_info info;
+ msi_alloc_info_t alloc_info;
};
/*
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 31378a2535fb9..07eb857efd15e 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -59,7 +59,8 @@ struct msi_ctrl {
static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);
-
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg);
/**
* msi_alloc_desc - Allocate an initialized msi_desc
@@ -1023,6 +1024,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
bundle->info.ops = &bundle->ops;
bundle->info.data = domain_data;
bundle->info.chip_data = chip_data;
+ bundle->info.alloc_data = &bundle->alloc_info;
pops = parent->msi_parent_ops;
snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
@@ -1061,11 +1063,18 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
if (!domain)
return false;
+ domain->dev = dev;
+ dev->msi.data->__domains[domid].domain = domain;
+
+ if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
+ dev->msi.data->__domains[domid].domain = NULL;
+ irq_domain_remove(domain);
+ return false;
+ }
+
/* @bundle and @fwnode_alloced are now in use. Prevent cleanup */
retain_and_null_ptr(bundle);
retain_and_null_ptr(fwnode_alloced);
- domain->dev = dev;
- dev->msi.data->__domains[domid].domain = domain;
return true;
}
@@ -1232,6 +1241,24 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
return 0;
}
+static int populate_alloc_info(struct irq_domain *domain, struct device *dev,
+ unsigned int nirqs, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the caller has provided a template alloc info, use that. Once
+ * all users of msi_create_irq_domain() have been eliminated, this
+ * should be the only source of allocation information, and the
+ * prepare call below should be finally removed.
+ */
+ if (!info->alloc_data)
+ return msi_domain_prepare_irqs(domain, dev, nirqs, arg);
+
+ *arg = *info->alloc_data;
+ return 0;
+}
+
static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
struct msi_ctrl *ctrl)
{
@@ -1244,7 +1271,7 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
unsigned long idx;
int i, ret, virq;
- ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
+ ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
--
2.39.2
Hi Marc,
On 2025/5/14 0:31, Marc Zyngier wrote:
> The current device MSI infrastructure is subtly broken, as it
> will issue an .msi_prepare() callback into the MSI controller
> driver every time it needs to allocate an MSI. That's pretty wrong,
> as the contract (or unwarranted assumption, depending who you ask)
> between the MSI controller and the core code is that .msi_prepare()
> is called exactly once per device.
>
> This leads to some subtle breakage in said MSI controller drivers,
> as it gives the impression that there are multiple endpoints sharing
> a bus identifier (RID in PCI parlance, DID for GICv3+). It implies
> that whatever allocation the ITS driver (for example) has done on
> behalf of these devices cannot be undone, as there is no way to
> track the shared state. This is particularly bad for wire-MSI devices,
> for which .msi_prepare() is called for. each. input. line.
>
> To address this issue, move the call to .msi_prepare() to take place
> at the point of irq domain allocation, which is the only place that
> makes sense. The msi_alloc_info_t structure is made part of the
> msi_domain_template, so that its life-cycle is that of the domain
> as well.
>
> Finally, the msi_info::alloc_data field is made to point at this
> allocation tracking structure, ensuring that it is carried around
> the block.
>
> This is all pretty straightforward, except for the non-device-MSI
> leftovers, which still have to call .msi_prepare() at the old
> spot. One day...
>
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
> include/linux/msi.h | 2 ++
> kernel/irq/msi.c | 35 +++++++++++++++++++++++++++++++----
> 2 files changed, 33 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index 63c23003ec9b7..ba1c77a829a1c 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -516,12 +516,14 @@ struct msi_domain_info {
> * @chip: Interrupt chip for this domain
> * @ops: MSI domain ops
> * @info: MSI domain info data
> + * @alloc_info: MSI domain allocation data (arch specific)
> */
> struct msi_domain_template {
> char name[48];
> struct irq_chip chip;
> struct msi_domain_ops ops;
> struct msi_domain_info info;
> + msi_alloc_info_t alloc_info;
> };
>
> /*
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index 31378a2535fb9..07eb857efd15e 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -59,7 +59,8 @@ struct msi_ctrl {
> static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
> static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
> static inline int msi_sysfs_create_group(struct device *dev);
> -
> +static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
> + int nvec, msi_alloc_info_t *arg);
>
> /**
> * msi_alloc_desc - Allocate an initialized msi_desc
> @@ -1023,6 +1024,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
> bundle->info.ops = &bundle->ops;
> bundle->info.data = domain_data;
> bundle->info.chip_data = chip_data;
> + bundle->info.alloc_data = &bundle->alloc_info;
>
> pops = parent->msi_parent_ops;
> snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
> @@ -1061,11 +1063,18 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
> if (!domain)
> return false;
>
> + domain->dev = dev;
> + dev->msi.data->__domains[domid].domain = domain;
> +
> + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
Does it work for MSI? hwsize is 1 in the MSI case, without taking
pci_msi_vec_count() into account.
bool pci_setup_msi_device_domain(struct pci_dev *pdev)
{
[...]
return pci_create_device_domain(pdev, &pci_msi_template, 1);
Thanks,
Zenghui
Hi Zenghui,
On Tue, 03 Jun 2025 09:22:47 +0100,
Zenghui Yu <yuzenghui@huawei.com> wrote:
>
> > + domain->dev = dev;
> > + dev->msi.data->__domains[domid].domain = domain;
> > +
> > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
>
> Does it work for MSI? hwsize is 1 in the MSI case, without taking
> pci_msi_vec_count() into account.
>
> bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> {
> [...]
>
> return pci_create_device_domain(pdev, &pci_msi_template, 1);
Well spotted.
This looks like a PCI bug ignoring Multi-MSI. Can you give the
following a go and let people know whether that fixes your issue?
Thanks,
M.
diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
index d7ba8795d60f..89677a21d525 100644
--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -287,7 +287,7 @@ static bool pci_create_device_domain(struct pci_dev *pdev, const struct msi_doma
* - The device is removed
* - MSI is disabled and a MSI-X domain is created
*/
-bool pci_setup_msi_device_domain(struct pci_dev *pdev)
+bool pci_setup_msi_device_domain(struct pci_dev *pdev, unsigned int hwsize)
{
if (WARN_ON_ONCE(pdev->msix_enabled))
return false;
@@ -297,7 +297,7 @@ bool pci_setup_msi_device_domain(struct pci_dev *pdev)
if (pci_match_device_domain(pdev, DOMAIN_BUS_PCI_DEVICE_MSIX))
msi_remove_device_irq_domain(&pdev->dev, MSI_DEFAULT_DOMAIN);
- return pci_create_device_domain(pdev, &pci_msi_template, 1);
+ return pci_create_device_domain(pdev, &pci_msi_template, hwsize);
}
/**
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index 8b8848788618..81891701840a 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -449,7 +449,7 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
if (rc)
return rc;
- if (!pci_setup_msi_device_domain(dev))
+ if (!pci_setup_msi_device_domain(dev, nvec))
return -ENODEV;
for (;;) {
diff --git a/drivers/pci/msi/msi.h b/drivers/pci/msi/msi.h
index ee53cf079f4e..3ab898af88a7 100644
--- a/drivers/pci/msi/msi.h
+++ b/drivers/pci/msi/msi.h
@@ -107,7 +107,7 @@ enum support_mode {
};
bool pci_msi_domain_supports(struct pci_dev *dev, unsigned int feature_mask, enum support_mode mode);
-bool pci_setup_msi_device_domain(struct pci_dev *pdev);
+bool pci_setup_msi_device_domain(struct pci_dev *pdev, unsigned int hwsize);
bool pci_setup_msix_device_domain(struct pci_dev *pdev, unsigned int hwsize);
/* Legacy (!IRQDOMAIN) fallbacks */
--
Jazz isn't dead. It just smells funny.
On Tue, Jun 03, 2025 at 01:50:47PM +0100, Marc Zyngier wrote:
> Hi Zenghui,
>
> On Tue, 03 Jun 2025 09:22:47 +0100,
> Zenghui Yu <yuzenghui@huawei.com> wrote:
> >
> > > + domain->dev = dev;
> > > + dev->msi.data->__domains[domid].domain = domain;
> > > +
> > > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
> >
> > Does it work for MSI? hwsize is 1 in the MSI case, without taking
> > pci_msi_vec_count() into account.
> >
> > bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> > {
> > [...]
> >
> > return pci_create_device_domain(pdev, &pci_msi_template, 1);
>
> Well spotted.
>
> This looks like a PCI bug ignoring Multi-MSI. Can you give the
> following a go and let people know whether that fixes your issue?
>
> Thanks,
>
> M.
>
> diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
> index d7ba8795d60f..89677a21d525 100644
> --- a/drivers/pci/msi/irqdomain.c
> +++ b/drivers/pci/msi/irqdomain.c
> @@ -287,7 +287,7 @@ static bool pci_create_device_domain(struct pci_dev *pdev, const struct msi_doma
> * - The device is removed
> * - MSI is disabled and a MSI-X domain is created
> */
> -bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> +bool pci_setup_msi_device_domain(struct pci_dev *pdev, unsigned int hwsize)
> {
> if (WARN_ON_ONCE(pdev->msix_enabled))
> return false;
> @@ -297,7 +297,7 @@ bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> if (pci_match_device_domain(pdev, DOMAIN_BUS_PCI_DEVICE_MSIX))
> msi_remove_device_irq_domain(&pdev->dev, MSI_DEFAULT_DOMAIN);
>
> - return pci_create_device_domain(pdev, &pci_msi_template, 1);
> + return pci_create_device_domain(pdev, &pci_msi_template, hwsize);
> }
>
> /**
> diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> index 8b8848788618..81891701840a 100644
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -449,7 +449,7 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
> if (rc)
> return rc;
>
> - if (!pci_setup_msi_device_domain(dev))
> + if (!pci_setup_msi_device_domain(dev, nvec))
If pci_msi_vec_count(dev) > maxvec we would cap nvec and size the
domain with the capped value.
In __pci_enable_msix_range() we are sizing the device according to
pci_msix_vec_count(dev) regardless of maxvec, if I read the code correctly.
While fixing it it would be good to make them consistent unless there is
a reason why they should not.
Lorenzo
> return -ENODEV;
>
> for (;;) {
> diff --git a/drivers/pci/msi/msi.h b/drivers/pci/msi/msi.h
> index ee53cf079f4e..3ab898af88a7 100644
> --- a/drivers/pci/msi/msi.h
> +++ b/drivers/pci/msi/msi.h
> @@ -107,7 +107,7 @@ enum support_mode {
> };
>
> bool pci_msi_domain_supports(struct pci_dev *dev, unsigned int feature_mask, enum support_mode mode);
> -bool pci_setup_msi_device_domain(struct pci_dev *pdev);
> +bool pci_setup_msi_device_domain(struct pci_dev *pdev, unsigned int hwsize);
> bool pci_setup_msix_device_domain(struct pci_dev *pdev, unsigned int hwsize);
>
> /* Legacy (!IRQDOMAIN) fallbacks */
>
> --
> Jazz isn't dead. It just smells funny.
On Tue, 03 Jun 2025 14:29:58 +0100,
Lorenzo Pieralisi <lpieralisi@kernel.org> wrote:
>
> On Tue, Jun 03, 2025 at 01:50:47PM +0100, Marc Zyngier wrote:
> > Hi Zenghui,
> >
> > On Tue, 03 Jun 2025 09:22:47 +0100,
> > Zenghui Yu <yuzenghui@huawei.com> wrote:
> > >
> > > > + domain->dev = dev;
> > > > + dev->msi.data->__domains[domid].domain = domain;
> > > > +
> > > > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
> > >
> > > Does it work for MSI? hwsize is 1 in the MSI case, without taking
> > > pci_msi_vec_count() into account.
> > >
> > > bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> > > {
> > > [...]
> > >
> > > return pci_create_device_domain(pdev, &pci_msi_template, 1);
> >
> > Well spotted.
> >
> > This looks like a PCI bug ignoring Multi-MSI. Can you give the
> > following a go and let people know whether that fixes your issue?
> >
> > Thanks,
> >
> > M.
> >
> > diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
> > index d7ba8795d60f..89677a21d525 100644
> > --- a/drivers/pci/msi/irqdomain.c
> > +++ b/drivers/pci/msi/irqdomain.c
> > @@ -287,7 +287,7 @@ static bool pci_create_device_domain(struct pci_dev *pdev, const struct msi_doma
> > * - The device is removed
> > * - MSI is disabled and a MSI-X domain is created
> > */
> > -bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> > +bool pci_setup_msi_device_domain(struct pci_dev *pdev, unsigned int hwsize)
> > {
> > if (WARN_ON_ONCE(pdev->msix_enabled))
> > return false;
> > @@ -297,7 +297,7 @@ bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> > if (pci_match_device_domain(pdev, DOMAIN_BUS_PCI_DEVICE_MSIX))
> > msi_remove_device_irq_domain(&pdev->dev, MSI_DEFAULT_DOMAIN);
> >
> > - return pci_create_device_domain(pdev, &pci_msi_template, 1);
> > + return pci_create_device_domain(pdev, &pci_msi_template, hwsize);
> > }
> >
> > /**
> > diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> > index 8b8848788618..81891701840a 100644
> > --- a/drivers/pci/msi/msi.c
> > +++ b/drivers/pci/msi/msi.c
> > @@ -449,7 +449,7 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
> > if (rc)
> > return rc;
> >
> > - if (!pci_setup_msi_device_domain(dev))
> > + if (!pci_setup_msi_device_domain(dev, nvec))
>
> If pci_msi_vec_count(dev) > maxvec we would cap nvec and size the
> domain with the capped value.
>
> In __pci_enable_msix_range() we are sizing the device according to
> pci_msix_vec_count(dev) regardless of maxvec, if I read the code correctly.
>
> While fixing it it would be good to make them consistent unless there is
> a reason why they should not.
This is indeed odd, but that'd be a separate fix. Something like:
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index 2090eef64b14..6ede55a7c5e6 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -439,9 +439,6 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
if (nvec < minvec)
return -ENOSPC;
- if (nvec > maxvec)
- nvec = maxvec;
-
rc = pci_setup_msi_context(dev);
if (rc)
return rc;
@@ -449,6 +446,9 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
if (!pci_setup_msi_device_domain(dev, nvec))
return -ENODEV;
+ if (nvec > maxvec)
+ nvec = maxvec;
+
for (;;) {
if (affd) {
nvec = irq_calc_affinity_vectors(minvec, nvec, affd);
--
Jazz isn't dead. It just smells funny.
On Tue, 03 Jun 2025 15:03:26 +0100,
Marc Zyngier <maz@kernel.org> wrote:
>
> On Tue, 03 Jun 2025 14:29:58 +0100,
> Lorenzo Pieralisi <lpieralisi@kernel.org> wrote:
> >
> > > + if (!pci_setup_msi_device_domain(dev, nvec))
> >
> > If pci_msi_vec_count(dev) > maxvec we would cap nvec and size the
> > domain with the capped value.
> >
> > In __pci_enable_msix_range() we are sizing the device according to
> > pci_msix_vec_count(dev) regardless of maxvec, if I read the code correctly.
> >
> > While fixing it it would be good to make them consistent unless there is
> > a reason why they should not.
>
> This is indeed odd, but that'd be a separate fix. Something like:
>
> diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> index 2090eef64b14..6ede55a7c5e6 100644
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -439,9 +439,6 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
> if (nvec < minvec)
> return -ENOSPC;
>
> - if (nvec > maxvec)
> - nvec = maxvec;
> -
> rc = pci_setup_msi_context(dev);
> if (rc)
> return rc;
> @@ -449,6 +446,9 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
> if (!pci_setup_msi_device_domain(dev, nvec))
> return -ENODEV;
>
> + if (nvec > maxvec)
> + nvec = maxvec;
> +
> for (;;) {
> if (affd) {
> nvec = irq_calc_affinity_vectors(minvec, nvec, affd);
>
Actually, let's lump the two together. There is no reason to wait.
M.
--
Jazz isn't dead. It just smells funny.
Hi Marc,
On 2025/6/3 20:50, Marc Zyngier wrote:
> Hi Zenghui,
>
> On Tue, 03 Jun 2025 09:22:47 +0100,
> Zenghui Yu <yuzenghui@huawei.com> wrote:
> >
> > > + domain->dev = dev;
> > > + dev->msi.data->__domains[domid].domain = domain;
> > > +
> > > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
> >
> > Does it work for MSI? hwsize is 1 in the MSI case, without taking
> > pci_msi_vec_count() into account.
> >
> > bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> > {
> > [...]
> >
> > return pci_create_device_domain(pdev, &pci_msi_template, 1);
>
> Well spotted.
>
> This looks like a PCI bug ignoring Multi-MSI. Can you give the
> following a go and let people know whether that fixes your issue?
I hit this problem on Kunpeng920 with some HiSilicon SAS (Serial
Attached SCSI controller) on it. These controllers are MSI-capable and
didn't work after this commit.
# lspci -v -s 74:02.0
74:02.0 Serial Attached SCSI controller: Huawei Technologies Co., Ltd.
HiSilicon SAS 3.0 HBA (rev 21)
Flags: bus master, fast devsel, latency 0, IRQ 42, NUMA node 0, IOMMU
group 27
Memory at a2000000 (32-bit, non-prefetchable) [size=32K]
Capabilities: [40] Express Root Complex Integrated Endpoint, MSI 00
Capabilities: [80] MSI: Enable+ Count=32/32 Maskable+ 64bit+
Capabilities: [b0] Power Management version 3
Kernel driver in use: hisi_sas_v3_hw
> diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
> index d7ba8795d60f..89677a21d525 100644
> --- a/drivers/pci/msi/irqdomain.c
> +++ b/drivers/pci/msi/irqdomain.c
> @@ -287,7 +287,7 @@ static bool pci_create_device_domain(struct pci_dev *pdev, const struct msi_doma
> * - The device is removed
> * - MSI is disabled and a MSI-X domain is created
> */
> -bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> +bool pci_setup_msi_device_domain(struct pci_dev *pdev, unsigned int hwsize)
> {
> if (WARN_ON_ONCE(pdev->msix_enabled))
> return false;
> @@ -297,7 +297,7 @@ bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> if (pci_match_device_domain(pdev, DOMAIN_BUS_PCI_DEVICE_MSIX))
> msi_remove_device_irq_domain(&pdev->dev, MSI_DEFAULT_DOMAIN);
>
> - return pci_create_device_domain(pdev, &pci_msi_template, 1);
> + return pci_create_device_domain(pdev, &pci_msi_template, hwsize);
> }
>
> /**
> diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> index 8b8848788618..81891701840a 100644
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -449,7 +449,7 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
> if (rc)
> return rc;
>
> - if (!pci_setup_msi_device_domain(dev))
> + if (!pci_setup_msi_device_domain(dev, nvec))
> return -ENODEV;
>
> for (;;) {
> diff --git a/drivers/pci/msi/msi.h b/drivers/pci/msi/msi.h
> index ee53cf079f4e..3ab898af88a7 100644
> --- a/drivers/pci/msi/msi.h
> +++ b/drivers/pci/msi/msi.h
> @@ -107,7 +107,7 @@ enum support_mode {
> };
>
> bool pci_msi_domain_supports(struct pci_dev *dev, unsigned int feature_mask, enum support_mode mode);
> -bool pci_setup_msi_device_domain(struct pci_dev *pdev);
> +bool pci_setup_msi_device_domain(struct pci_dev *pdev, unsigned int hwsize);
> bool pci_setup_msix_device_domain(struct pci_dev *pdev, unsigned int hwsize);
>
> /* Legacy (!IRQDOMAIN) fallbacks */
I have the exact same diff to get my box to work again ;-)
Tested-by: Zenghui Yu <yuzenghui@huawei.com>
Thanks for your fix!
Zenghui
On Tue, 03 Jun 2025 14:07:04 +0100,
Zenghui Yu <yuzenghui@huawei.com> wrote:
>
> Hi Marc,
>
> On 2025/6/3 20:50, Marc Zyngier wrote:
> > Hi Zenghui,
> >
> > On Tue, 03 Jun 2025 09:22:47 +0100,
> > Zenghui Yu <yuzenghui@huawei.com> wrote:
> > >
> > > > + domain->dev = dev;
> > > > + dev->msi.data->__domains[domid].domain = domain;
> > > > +
> > > > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
> > >
> > > Does it work for MSI? hwsize is 1 in the MSI case, without taking
> > > pci_msi_vec_count() into account.
> > >
> > > bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> > > {
> > > [...]
> > >
> > > return pci_create_device_domain(pdev, &pci_msi_template, 1);
> >
> > Well spotted.
> >
> > This looks like a PCI bug ignoring Multi-MSI. Can you give the
> > following a go and let people know whether that fixes your issue?
>
> I hit this problem on Kunpeng920 with some HiSilicon SAS (Serial
> Attached SCSI controller) on it. These controllers are MSI-capable and
> didn't work after this commit.
That's interesting. It means that the sizing of the irqdomain to 1
wasn't getting in the way, even if that was obviously wrong. The funny
thing is that the msi_desc would still be OK, as we only have one for
MSI, no matter how many vectors are used.
>
> I have the exact same diff to get my box to work again ;-)
Ah ;-)
>
> Tested-by: Zenghui Yu <yuzenghui@huawei.com>
>
> Thanks for your fix!
Thank you!
M.
--
Jazz isn't dead. It just smells funny.
On Tue, Jun 03, 2025 at 04:22:47PM +0800, Zenghui Yu wrote:
> Hi Marc,
>
> On 2025/5/14 0:31, Marc Zyngier wrote:
> > The current device MSI infrastructure is subtly broken, as it
> > will issue an .msi_prepare() callback into the MSI controller
> > driver every time it needs to allocate an MSI. That's pretty wrong,
> > as the contract (or unwarranted assumption, depending who you ask)
> > between the MSI controller and the core code is that .msi_prepare()
> > is called exactly once per device.
> >
> > This leads to some subtle breakage in said MSI controller drivers,
> > as it gives the impression that there are multiple endpoints sharing
> > a bus identifier (RID in PCI parlance, DID for GICv3+). It implies
> > that whatever allocation the ITS driver (for example) has done on
> > behalf of these devices cannot be undone, as there is no way to
> > track the shared state. This is particularly bad for wire-MSI devices,
> > for which .msi_prepare() is called for. each. input. line.
> >
> > To address this issue, move the call to .msi_prepare() to take place
> > at the point of irq domain allocation, which is the only place that
> > makes sense. The msi_alloc_info_t structure is made part of the
> > msi_domain_template, so that its life-cycle is that of the domain
> > as well.
> >
> > Finally, the msi_info::alloc_data field is made to point at this
> > allocation tracking structure, ensuring that it is carried around
> > the block.
> >
> > This is all pretty straightforward, except for the non-device-MSI
> > leftovers, which still have to call .msi_prepare() at the old
> > spot. One day...
> >
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > ---
> > include/linux/msi.h | 2 ++
> > kernel/irq/msi.c | 35 +++++++++++++++++++++++++++++++----
> > 2 files changed, 33 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/msi.h b/include/linux/msi.h
> > index 63c23003ec9b7..ba1c77a829a1c 100644
> > --- a/include/linux/msi.h
> > +++ b/include/linux/msi.h
> > @@ -516,12 +516,14 @@ struct msi_domain_info {
> > * @chip: Interrupt chip for this domain
> > * @ops: MSI domain ops
> > * @info: MSI domain info data
> > + * @alloc_info: MSI domain allocation data (arch specific)
> > */
> > struct msi_domain_template {
> > char name[48];
> > struct irq_chip chip;
> > struct msi_domain_ops ops;
> > struct msi_domain_info info;
> > + msi_alloc_info_t alloc_info;
> > };
> >
> > /*
> > diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> > index 31378a2535fb9..07eb857efd15e 100644
> > --- a/kernel/irq/msi.c
> > +++ b/kernel/irq/msi.c
> > @@ -59,7 +59,8 @@ struct msi_ctrl {
> > static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
> > static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
> > static inline int msi_sysfs_create_group(struct device *dev);
> > -
> > +static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
> > + int nvec, msi_alloc_info_t *arg);
> >
> > /**
> > * msi_alloc_desc - Allocate an initialized msi_desc
> > @@ -1023,6 +1024,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
> > bundle->info.ops = &bundle->ops;
> > bundle->info.data = domain_data;
> > bundle->info.chip_data = chip_data;
> > + bundle->info.alloc_data = &bundle->alloc_info;
> >
> > pops = parent->msi_parent_ops;
> > snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
> > @@ -1061,11 +1063,18 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
> > if (!domain)
> > return false;
> >
> > + domain->dev = dev;
> > + dev->msi.data->__domains[domid].domain = domain;
> > +
> > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
>
> Does it work for MSI?
This means that it does not work for MSI for you as it stands, right ?
If you spotted an issue, thanks for that, report it fully please.
> hwsize is 1 in the MSI case, without taking pci_msi_vec_count() into account.
>
> bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> {
> [...]
>
> return pci_create_device_domain(pdev, &pci_msi_template, 1);
I had a stab at it with GICv5 models and an MSI capable device and this indeed
calls the ITS msi_prepare() callback with 1 as vector count, so we size
the device tables wrongly.
The question is why pci_create_device_domain() is called here with
hwsize == 1. Probably, before this series, the ITS MSI parent code was
fixing the size up so we did not notice, I need to check.
Lorenzo
On Tue, 03 Jun 2025 10:35:51 +0100,
Lorenzo Pieralisi <lpieralisi@kernel.org> wrote:
>
> On Tue, Jun 03, 2025 at 04:22:47PM +0800, Zenghui Yu wrote:
> > > + domain->dev = dev;
> > > + dev->msi.data->__domains[domid].domain = domain;
> > > +
> > > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
> >
> > Does it work for MSI?
>
> This means that it does not work for MSI for you as it stands, right ?
>
> If you spotted an issue, thanks for that, report it fully please.
Honestly, you're barking up the wrong tree. Zenghui points us to a
glaring bug in the core code, with detailed information on what could
go wrong, as well as what is wrong in the code. It doesn't get better
than that.
The usual level of bug reports is "its b0rken", sometimes followed by
a trace with lots of hex and no information. Spot the difference?
>
> > hwsize is 1 in the MSI case, without taking pci_msi_vec_count() into account.
> >
> > bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> > {
> > [...]
> >
> > return pci_create_device_domain(pdev, &pci_msi_template, 1);
>
> I had a stab at it with GICv5 models and an MSI capable device and this indeed
> calls the ITS msi_prepare() callback with 1 as vector count, so we size
> the device tables wrongly.
Not wrongly. Exactly as instructed.
>
> The question is why pci_create_device_domain() is called here with
> hwsize == 1. Probably, before this series, the ITS MSI parent code was
> fixing the size up so we did not notice, I need to check.
The GICv3 ITS code would upgrade the vector count to the next power of
two (one bit of EID space -> 2 MSIs), but with the device domain
squarely set to 1, the endpoint driver would never get more. It is
prepared to fail gracefully though, hence nothing really breaks.
I don't think this patch makes anything regress though. Commit
15c72f824b327 seems to be the offending one. If Zenghui confirms that
the hack I posted separately works for him, I'll follow up with a
"real" patch.
M.
--
Jazz isn't dead. It just smells funny.
On Tue, Jun 03, 2025 at 02:09:50PM +0100, Marc Zyngier wrote:
> On Tue, 03 Jun 2025 10:35:51 +0100,
> Lorenzo Pieralisi <lpieralisi@kernel.org> wrote:
> >
> > On Tue, Jun 03, 2025 at 04:22:47PM +0800, Zenghui Yu wrote:
> > > > + domain->dev = dev;
> > > > + dev->msi.data->__domains[domid].domain = domain;
> > > > +
> > > > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
> > >
> > > Does it work for MSI?
> >
> > This means that it does not work for MSI for you as it stands, right ?
> >
> > If you spotted an issue, thanks for that, report it fully please.
>
> Honestly, you're barking up the wrong tree. Zenghui points us to a
> glaring bug in the core code, with detailed information on what could
> go wrong, as well as what is wrong in the code. It doesn't get better
> than that.
>
> The usual level of bug reports is "its b0rken", sometimes followed by
> a trace with lots of hex and no information. Spot the difference?
Agreed, thanks again Zenghui for reporting it and forgive me if the
message sounded a bit patronizing, I did not mean it.
Lorenzo
> > > hwsize is 1 in the MSI case, without taking pci_msi_vec_count() into account.
> > >
> > > bool pci_setup_msi_device_domain(struct pci_dev *pdev)
> > > {
> > > [...]
> > >
> > > return pci_create_device_domain(pdev, &pci_msi_template, 1);
> >
> > I had a stab at it with GICv5 models and an MSI capable device and this indeed
> > calls the ITS msi_prepare() callback with 1 as vector count, so we size
> > the device tables wrongly.
>
> Not wrongly. Exactly as instructed.
>
> >
> > The question is why pci_create_device_domain() is called here with
> > hwsize == 1. Probably, before this series, the ITS MSI parent code was
> > fixing the size up so we did not notice, I need to check.
>
> The GICv3 ITS code would upgrade the vector count to the next power of
> two (one bit of EID space -> 2 MSIs), but with the device domain
> squarely set to 1, the endpoint driver would never get more. It is
> prepared to fail gracefully though, hence nothing really breaks.
>
> I don't think this patch makes anything regress though. Commit
> 15c72f824b327 seems to be the offending one. If Zenghui confirms that
> the hack I posted separately works for him, I'll follow up with a
> "real" patch.
>
> M.
>
> --
> Jazz isn't dead. It just smells funny.
On 2025/6/3 22:37, Lorenzo Pieralisi wrote:
> On Tue, Jun 03, 2025 at 02:09:50PM +0100, Marc Zyngier wrote:
> > On Tue, 03 Jun 2025 10:35:51 +0100,
> > Lorenzo Pieralisi <lpieralisi@kernel.org> wrote:
> > >
> > > On Tue, Jun 03, 2025 at 04:22:47PM +0800, Zenghui Yu wrote:
> > > > > + domain->dev = dev;
> > > > > + dev->msi.data->__domains[domid].domain = domain;
> > > > > +
> > > > > + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
> > > >
> > > > Does it work for MSI?
> > >
> > > This means that it does not work for MSI for you as it stands, right ?
> > >
> > > If you spotted an issue, thanks for that, report it fully please.
> >
> > Honestly, you're barking up the wrong tree. Zenghui points us to a
> > glaring bug in the core code, with detailed information on what could
> > go wrong, as well as what is wrong in the code. It doesn't get better
> > than that.
> >
> > The usual level of bug reports is "its b0rken", sometimes followed by
> > a trace with lots of hex and no information. Spot the difference?
>
> Agreed, thanks again Zenghui for reporting it and forgive me if the
> message sounded a bit patronizing, I did not mean it.
Never mind :-)
Zenghui
The following commit has been merged into the irq/msi branch of tip:
Commit-ID: 1396e89e09f00deb816e5f4a176d71d554922292
Gitweb: https://git.kernel.org/tip/1396e89e09f00deb816e5f4a176d71d554922292
Author: Marc Zyngier <maz@kernel.org>
AuthorDate: Tue, 13 May 2025 17:31:42 +01:00
Committer: Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Wed, 14 May 2025 12:36:41 +02:00
genirq/msi: Move prepare() call to per-device allocation
The current device MSI infrastructure is subtly broken, as it will issue an
.msi_prepare() callback into the MSI controller driver every time it needs
to allocate an MSI. That's pretty wrong, as the contract (or unwarranted
assumption, depending who you ask) between the MSI controller and the core
code is that .msi_prepare() is called exactly once per device.
This leads to some subtle breakage in some MSI controller drivers, as it
gives the impression that there are multiple endpoints sharing a bus
identifier (RID in PCI parlance, DID for GICv3+). It implies that whatever
allocation the ITS driver (for example) has done on behalf of these devices
cannot be undone, as there is no way to track the shared state. This is
particularly bad for wire-MSI devices, for which .msi_prepare() is called
for each input line.
To address this issue, move the call to .msi_prepare() to take place at the
point of irq domain allocation, which is the only place that makes
sense. The msi_alloc_info_t structure is made part of the
msi_domain_template, so that its life-cycle is that of the domain as well.
Finally, the msi_info::alloc_data field is made to point at this allocation
tracking structure, ensuring that it is carried around the block.
This is all pretty straightforward, except for the non-device-MSI
leftovers, which still have to call .msi_prepare() at the old spot. One
day...
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250513163144.2215824-4-maz@kernel.org
---
include/linux/msi.h | 2 ++
kernel/irq/msi.c | 35 +++++++++++++++++++++++++++++++----
2 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 63c2300..f4b94cc 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -516,12 +516,14 @@ struct msi_domain_info {
* @chip: Interrupt chip for this domain
* @ops: MSI domain ops
* @info: MSI domain info data
+ * @alloc_info: MSI domain allocation data (architecture specific)
*/
struct msi_domain_template {
char name[48];
struct irq_chip chip;
struct msi_domain_ops ops;
struct msi_domain_info info;
+ msi_alloc_info_t alloc_info;
};
/*
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 00f4d87..1098f26 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -59,7 +59,8 @@ struct msi_ctrl {
static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);
-
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg);
/**
* msi_alloc_desc - Allocate an initialized msi_desc
@@ -1023,6 +1024,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
bundle->info.ops = &bundle->ops;
bundle->info.data = domain_data;
bundle->info.chip_data = chip_data;
+ bundle->info.alloc_data = &bundle->alloc_info;
pops = parent->msi_parent_ops;
snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
@@ -1061,11 +1063,18 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
if (!domain)
return false;
+ domain->dev = dev;
+ dev->msi.data->__domains[domid].domain = domain;
+
+ if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
+ dev->msi.data->__domains[domid].domain = NULL;
+ irq_domain_remove(domain);
+ return false;
+ }
+
/* @bundle and @fwnode_alloced are now in use. Prevent cleanup */
retain_and_null_ptr(bundle);
retain_and_null_ptr(fwnode_alloced);
- domain->dev = dev;
- dev->msi.data->__domains[domid].domain = domain;
return true;
}
@@ -1232,6 +1241,24 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
return 0;
}
+static int populate_alloc_info(struct irq_domain *domain, struct device *dev,
+ unsigned int nirqs, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the caller has provided a template alloc info, use that. Once
+ * all users of msi_create_irq_domain() have been eliminated, this
+ * should be the only source of allocation information, and the
+ * prepare call below should be finally removed.
+ */
+ if (!info->alloc_data)
+ return msi_domain_prepare_irqs(domain, dev, nirqs, arg);
+
+ *arg = *info->alloc_data;
+ return 0;
+}
+
static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
struct msi_ctrl *ctrl)
{
@@ -1244,7 +1271,7 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
unsigned long idx;
int i, ret, virq;
- ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
+ ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
© 2016 - 2025 Red Hat, Inc.