arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-)
Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
when the device's MSI-X table size is larger than the firmware's MSI quota
for the device.
This is because the commit changes how rtas_prepare_msi_irqs() is called:
- Before, it is called when interrupts are allocated at the global
interrupt domain with nvec_in being the number of allocated interrupts.
rtas_prepare_msi_irqs() can return a positive number and the allocation
will be retried.
- Now, it is called at the creation of per-device interrupt domain with
nvec_in being the number of interrupts that the device supports. If
rtas_prepare_msi_irqs() returns positive, domain creation just fails.
For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
number (the quota). This causes per-device interrupt domain creation to
fail and thus the NVMe driver cannot enable MSI-X.
Rework to make this scenario works again:
- pseries_msi_ops_prepare() only prepares as many interrupts as the quota
permit.
- pseries_irq_domain_alloc() fails if the device's quota is exceeded.
Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare
as allowed by the quota. If device drivers attempt to allocate more
interrupts than the quota permits, pseries_irq_domain_alloc() will return
an error code and msi_handle_pci_fail() will allow device drivers a retry.
Reported-by: Nilay Shroff <nilay@linux.ibm.com>
Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@linux.ibm.com/
Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()")
Signed-off-by: Nam Cao <namcao@linutronix.de>
Acked-by: Nilay Shroff <nilay@linux.ibm.com>
Cc: stable@vger.kernel.org
---
v2:
- change pseries_msi_ops_prepare()'s allocation logic to match the
original logic in __pci_enable_msix_range()
- fix up Nilay's email address
---
arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++--
1 file changed, 41 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index a82aaa786e9e..edc30cda5dbc 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -19,6 +19,11 @@
#include "pseries.h"
+struct pseries_msi_device {
+ unsigned int msi_quota;
+ unsigned int msi_used;
+};
+
static int query_token, change_token;
#define RTAS_QUERY_FN 0
@@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
struct msi_domain_info *info = domain->host_data;
struct pci_dev *pdev = to_pci_dev(dev);
int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
+ int ret;
+
+ struct pseries_msi_device *pseries_dev __free(kfree)
+ = kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
+ if (!pseries_dev)
+ return -ENOMEM;
+
+ while (1) {
+ ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+ if (!ret)
+ break;
+ else if (ret > 0)
+ nvec = ret;
+ else
+ return ret;
+ }
- return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+ pseries_dev->msi_quota = nvec;
+ pseries_dev->msi_used = 0;
+
+ arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
+ return 0;
}
/*
@@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
*/
static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
{
+ struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
struct pci_dev *pdev = to_pci_dev(domain->dev);
rtas_disable_msi(pdev);
+
+ WARN_ON(pseries_dev->msi_used);
+ kfree(pseries_dev);
}
static void pseries_msi_shutdown(struct irq_data *d)
@@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
unsigned int nr_irqs, void *arg)
{
struct pci_controller *phb = domain->host_data;
+ struct pseries_msi_device *pseries_dev;
msi_alloc_info_t *info = arg;
struct msi_desc *desc = info->desc;
struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
int hwirq;
int i, ret;
+ pseries_dev = info->scratchpad[0].ptr;
+
+ if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
+ return -ENOSPC;
+
hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
if (hwirq < 0) {
dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
@@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
goto out;
irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
- &pseries_msi_irq_chip, domain->host_data);
+ &pseries_msi_irq_chip, pseries_dev);
}
+ pseries_dev->msi_used++;
return 0;
out:
@@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
unsigned int nr_irqs)
{
struct irq_data *d = irq_domain_get_irq_data(domain, virq);
- struct pci_controller *phb = irq_data_get_irq_chip_data(d);
+ struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
+ struct pci_controller *phb = domain->host_data;
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
+ pseries_dev->msi_used -= nr_irqs;
irq_domain_free_irqs_parent(domain, virq, nr_irqs);
}
--
2.51.0
On Wed, 07 Jan 2026 10:02:30 +0000, Nam Cao wrote:
> Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
> to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
> when the device's MSI-X table size is larger than the firmware's MSI quota
> for the device.
>
> This is because the commit changes how rtas_prepare_msi_irqs() is called:
>
> [...]
Applied to powerpc/next.
[1/1] powerpc/pseries: Fix MSI-X allocation failure when quota is exceeded
https://git.kernel.org/powerpc/c/c0215e2d72debcd9cbc1c002fb012d50a3140387
cheers
On 1/7/26 3:32 PM, Nam Cao wrote:
> Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
> to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
> when the device's MSI-X table size is larger than the firmware's MSI quota
> for the device.
>
> This is because the commit changes how rtas_prepare_msi_irqs() is called:
>
> - Before, it is called when interrupts are allocated at the global
> interrupt domain with nvec_in being the number of allocated interrupts.
> rtas_prepare_msi_irqs() can return a positive number and the allocation
> will be retried.
>
> - Now, it is called at the creation of per-device interrupt domain with
> nvec_in being the number of interrupts that the device supports. If
> rtas_prepare_msi_irqs() returns positive, domain creation just fails.
>
> For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
> number (the quota). This causes per-device interrupt domain creation to
> fail and thus the NVMe driver cannot enable MSI-X.
>
> Rework to make this scenario works again:
>
> - pseries_msi_ops_prepare() only prepares as many interrupts as the quota
> permit.
>
> - pseries_irq_domain_alloc() fails if the device's quota is exceeded.
>
> Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare
> as allowed by the quota. If device drivers attempt to allocate more
> interrupts than the quota permits, pseries_irq_domain_alloc() will return
> an error code and msi_handle_pci_fail() will allow device drivers a retry.
>
> Reported-by: Nilay Shroff <nilay@linux.ibm.com>
> Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@linux.ibm.com/
> Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()")
> Signed-off-by: Nam Cao <namcao@linutronix.de>
> Acked-by: Nilay Shroff <nilay@linux.ibm.com>
> Cc: stable@vger.kernel.org
> ---
> v2:
> - change pseries_msi_ops_prepare()'s allocation logic to match the
> original logic in __pci_enable_msix_range()
>
> - fix up Nilay's email address
> ---
> arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++--
> 1 file changed, 41 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
> index a82aaa786e9e..edc30cda5dbc 100644
> --- a/arch/powerpc/platforms/pseries/msi.c
> +++ b/arch/powerpc/platforms/pseries/msi.c
> @@ -19,6 +19,11 @@
>
> #include "pseries.h"
>
> +struct pseries_msi_device {
> + unsigned int msi_quota;
> + unsigned int msi_used;
> +};
> +
> static int query_token, change_token;
>
> #define RTAS_QUERY_FN 0
> @@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
> struct msi_domain_info *info = domain->host_data;
> struct pci_dev *pdev = to_pci_dev(dev);
> int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
> + int ret;
> +
> + struct pseries_msi_device *pseries_dev __free(kfree)
> + = kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
> + if (!pseries_dev)
> + return -ENOMEM;
> +
> + while (1) {
> + ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
> + if (!ret)
> + break;
> + else if (ret > 0)
> + nvec = ret;
> + else
> + return ret;
> + }
>
> - return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
> + pseries_dev->msi_quota = nvec;
> + pseries_dev->msi_used = 0;
> +
> + arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
> + return 0;
> }
>
> /*
> @@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
> */
> static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
> {
> + struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
> struct pci_dev *pdev = to_pci_dev(domain->dev);
>
> rtas_disable_msi(pdev);
> +
> + WARN_ON(pseries_dev->msi_used);
> + kfree(pseries_dev);
> }
>
> static void pseries_msi_shutdown(struct irq_data *d)
> @@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
> unsigned int nr_irqs, void *arg)
> {
> struct pci_controller *phb = domain->host_data;
> + struct pseries_msi_device *pseries_dev;
> msi_alloc_info_t *info = arg;
> struct msi_desc *desc = info->desc;
> struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
> int hwirq;
> int i, ret;
>
> + pseries_dev = info->scratchpad[0].ptr;
> +
> + if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
> + return -ENOSPC;
> +
> hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
> if (hwirq < 0) {
> dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
> @@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
> goto out;
>
> irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
> - &pseries_msi_irq_chip, domain->host_data);
> + &pseries_msi_irq_chip, pseries_dev);
> }
>
> + pseries_dev->msi_used++;
> return 0;
>
> out:
> @@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
> unsigned int nr_irqs)
> {
> struct irq_data *d = irq_domain_get_irq_data(domain, virq);
> - struct pci_controller *phb = irq_data_get_irq_chip_data(d);
> + struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
> + struct pci_controller *phb = domain->host_data;
>
> pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
> + pseries_dev->msi_used -= nr_irqs;
> irq_domain_free_irqs_parent(domain, virq, nr_irqs);
> }
>
I just tested this change on my system using the latest mainline kernel and it works
well for me. So with that, please fell free to add,
Tested-by: Nilay Shroff <nilay@linux.ibm.com>
On 1/7/26 7:55 PM, Nilay Shroff wrote:
>
> On 1/7/26 3:32 PM, Nam Cao wrote:
>> Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
>> to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
>> when the device's MSI-X table size is larger than the firmware's MSI quota
>> for the device.
>>
>> This is because the commit changes how rtas_prepare_msi_irqs() is called:
>>
>> - Before, it is called when interrupts are allocated at the global
>> interrupt domain with nvec_in being the number of allocated interrupts.
>> rtas_prepare_msi_irqs() can return a positive number and the allocation
>> will be retried.
>>
>> - Now, it is called at the creation of per-device interrupt domain with
>> nvec_in being the number of interrupts that the device supports. If
>> rtas_prepare_msi_irqs() returns positive, domain creation just fails.
>>
>> For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
>> number (the quota). This causes per-device interrupt domain creation to
>> fail and thus the NVMe driver cannot enable MSI-X.
>>
>> Rework to make this scenario works again:
>>
>> - pseries_msi_ops_prepare() only prepares as many interrupts as the quota
>> permit.
>>
>> - pseries_irq_domain_alloc() fails if the device's quota is exceeded.
>>
>> Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare
>> as allowed by the quota. If device drivers attempt to allocate more
>> interrupts than the quota permits, pseries_irq_domain_alloc() will return
>> an error code and msi_handle_pci_fail() will allow device drivers a retry.
>>
>> Reported-by: Nilay Shroff <nilay@linux.ibm.com>
>> Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@linux.ibm.com/
>> Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()")
>> Signed-off-by: Nam Cao <namcao@linutronix.de>
>> Acked-by: Nilay Shroff <nilay@linux.ibm.com>
>> Cc: stable@vger.kernel.org
>> ---
>> v2:
>> - change pseries_msi_ops_prepare()'s allocation logic to match the
>> original logic in __pci_enable_msix_range()
>>
>> - fix up Nilay's email address
>> ---
>> arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++--
>> 1 file changed, 41 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
>> index a82aaa786e9e..edc30cda5dbc 100644
>> --- a/arch/powerpc/platforms/pseries/msi.c
>> +++ b/arch/powerpc/platforms/pseries/msi.c
>> @@ -19,6 +19,11 @@
>>
>> #include "pseries.h"
>>
>> +struct pseries_msi_device {
>> + unsigned int msi_quota;
>> + unsigned int msi_used;
>> +};
>> +
>> static int query_token, change_token;
>>
>> #define RTAS_QUERY_FN 0
>> @@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
>> struct msi_domain_info *info = domain->host_data;
>> struct pci_dev *pdev = to_pci_dev(dev);
>> int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
>> + int ret;
>> +
>> + struct pseries_msi_device *pseries_dev __free(kfree)
>> + = kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
>> + if (!pseries_dev)
>> + return -ENOMEM;
>> +
>> + while (1) {
>> + ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
>> + if (!ret)
>> + break;
>> + else if (ret > 0)
>> + nvec = ret;
>> + else
>> + return ret;
>> + }
>>
>> - return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
>> + pseries_dev->msi_quota = nvec;
>> + pseries_dev->msi_used = 0;
>> +
>> + arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
>> + return 0;
>> }
>>
>> /*
>> @@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
>> */
>> static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
>> {
>> + struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
>> struct pci_dev *pdev = to_pci_dev(domain->dev);
>>
>> rtas_disable_msi(pdev);
>> +
>> + WARN_ON(pseries_dev->msi_used);
>> + kfree(pseries_dev);
>> }
>>
>> static void pseries_msi_shutdown(struct irq_data *d)
>> @@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
>> unsigned int nr_irqs, void *arg)
>> {
>> struct pci_controller *phb = domain->host_data;
>> + struct pseries_msi_device *pseries_dev;
>> msi_alloc_info_t *info = arg;
>> struct msi_desc *desc = info->desc;
>> struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
>> int hwirq;
>> int i, ret;
>>
>> + pseries_dev = info->scratchpad[0].ptr;
>> +
>> + if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
>> + return -ENOSPC;
>> +
>> hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
>> if (hwirq < 0) {
>> dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
>> @@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
>> goto out;
>>
>> irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
>> - &pseries_msi_irq_chip, domain->host_data);
>> + &pseries_msi_irq_chip, pseries_dev);
>> }
>>
>> + pseries_dev->msi_used++;
>> return 0;
>>
>> out:
>> @@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
>> unsigned int nr_irqs)
>> {
>> struct irq_data *d = irq_domain_get_irq_data(domain, virq);
>> - struct pci_controller *phb = irq_data_get_irq_chip_data(d);
>> + struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
>> + struct pci_controller *phb = domain->host_data;
>>
>> pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
>> + pseries_dev->msi_used -= nr_irqs;
>> irq_domain_free_irqs_parent(domain, virq, nr_irqs);
>> }
>>
> I just tested this change on my system using the latest mainline kernel and it works
> well for me. So with that, please fell free to add,
>
> Tested-by: Nilay Shroff <nilay@linux.ibm.com>
Thanks will pull this in.
Maddy
>
© 2016 - 2026 Red Hat, Inc.