[PATCH v2] powerpc/pseries: Fix MSI-X allocation failure when quota is exceeded

Nam Cao posted 1 patch 1 month ago
arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++--
1 file changed, 41 insertions(+), 3 deletions(-)
[PATCH v2] powerpc/pseries: Fix MSI-X allocation failure when quota is exceeded
Posted by Nam Cao 1 month ago
Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
when the device's MSI-X table size is larger than the firmware's MSI quota
for the device.

This is because the commit changes how rtas_prepare_msi_irqs() is called:

  - Before, it is called when interrupts are allocated at the global
    interrupt domain with nvec_in being the number of allocated interrupts.
    rtas_prepare_msi_irqs() can return a positive number and the allocation
    will be retried.

  - Now, it is called at the creation of per-device interrupt domain with
    nvec_in being the number of interrupts that the device supports. If
    rtas_prepare_msi_irqs() returns positive, domain creation just fails.

For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
number (the quota). This causes per-device interrupt domain creation to
fail and thus the NVMe driver cannot enable MSI-X.

Rework to make this scenario works again:

  - pseries_msi_ops_prepare() only prepares as many interrupts as the quota
    permit.

  - pseries_irq_domain_alloc() fails if the device's quota is exceeded.

Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare
as allowed by the quota. If device drivers attempt to allocate more
interrupts than the quota permits, pseries_irq_domain_alloc() will return
an error code and msi_handle_pci_fail() will allow device drivers a retry.

Reported-by: Nilay Shroff <nilay@linux.ibm.com>
Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@linux.ibm.com/
Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()")
Signed-off-by: Nam Cao <namcao@linutronix.de>
Acked-by: Nilay Shroff <nilay@linux.ibm.com>
Cc: stable@vger.kernel.org
---
v2:
  - change pseries_msi_ops_prepare()'s allocation logic to match the
    original logic in __pci_enable_msix_range()

  - fix up Nilay's email address
---
 arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index a82aaa786e9e..edc30cda5dbc 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -19,6 +19,11 @@
 
 #include "pseries.h"
 
+struct pseries_msi_device {
+	unsigned int msi_quota;
+	unsigned int msi_used;
+};
+
 static int query_token, change_token;
 
 #define RTAS_QUERY_FN		0
@@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
 	struct msi_domain_info *info = domain->host_data;
 	struct pci_dev *pdev = to_pci_dev(dev);
 	int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
+	int ret;
+
+	struct pseries_msi_device *pseries_dev __free(kfree)
+		= kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
+	if (!pseries_dev)
+		return -ENOMEM;
+
+	while (1) {
+		ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+		if (!ret)
+			break;
+		else if (ret > 0)
+			nvec = ret;
+		else
+			return ret;
+	}
 
-	return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+	pseries_dev->msi_quota = nvec;
+	pseries_dev->msi_used = 0;
+
+	arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
+	return 0;
 }
 
 /*
@@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
  */
 static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
 {
+	struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
 	struct pci_dev *pdev = to_pci_dev(domain->dev);
 
 	rtas_disable_msi(pdev);
+
+	WARN_ON(pseries_dev->msi_used);
+	kfree(pseries_dev);
 }
 
 static void pseries_msi_shutdown(struct irq_data *d)
@@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
 				    unsigned int nr_irqs, void *arg)
 {
 	struct pci_controller *phb = domain->host_data;
+	struct pseries_msi_device *pseries_dev;
 	msi_alloc_info_t *info = arg;
 	struct msi_desc *desc = info->desc;
 	struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
 	int hwirq;
 	int i, ret;
 
+	pseries_dev = info->scratchpad[0].ptr;
+
+	if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
+		return -ENOSPC;
+
 	hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
 	if (hwirq < 0) {
 		dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
@@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
 			goto out;
 
 		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
-					      &pseries_msi_irq_chip, domain->host_data);
+					      &pseries_msi_irq_chip, pseries_dev);
 	}
 
+	pseries_dev->msi_used++;
 	return 0;
 
 out:
@@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
 				    unsigned int nr_irqs)
 {
 	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
-	struct pci_controller *phb = irq_data_get_irq_chip_data(d);
+	struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
+	struct pci_controller *phb = domain->host_data;
 
 	pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
+	pseries_dev->msi_used -= nr_irqs;
 	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
 }
 
-- 
2.51.0
Re: [PATCH v2] powerpc/pseries: Fix MSI-X allocation failure when quota is exceeded
Posted by Madhavan Srinivasan 3 weeks, 4 days ago
On Wed, 07 Jan 2026 10:02:30 +0000, Nam Cao wrote:
> Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
> to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
> when the device's MSI-X table size is larger than the firmware's MSI quota
> for the device.
> 
> This is because the commit changes how rtas_prepare_msi_irqs() is called:
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/pseries: Fix MSI-X allocation failure when quota is exceeded
      https://git.kernel.org/powerpc/c/c0215e2d72debcd9cbc1c002fb012d50a3140387

cheers
Re: [PATCH v2] powerpc/pseries: Fix MSI-X allocation failure when quota is exceeded
Posted by Nilay Shroff 1 month ago

On 1/7/26 3:32 PM, Nam Cao wrote:
> Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
> to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
> when the device's MSI-X table size is larger than the firmware's MSI quota
> for the device.
> 
> This is because the commit changes how rtas_prepare_msi_irqs() is called:
> 
>   - Before, it is called when interrupts are allocated at the global
>     interrupt domain with nvec_in being the number of allocated interrupts.
>     rtas_prepare_msi_irqs() can return a positive number and the allocation
>     will be retried.
> 
>   - Now, it is called at the creation of per-device interrupt domain with
>     nvec_in being the number of interrupts that the device supports. If
>     rtas_prepare_msi_irqs() returns positive, domain creation just fails.
> 
> For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
> number (the quota). This causes per-device interrupt domain creation to
> fail and thus the NVMe driver cannot enable MSI-X.
> 
> Rework to make this scenario works again:
> 
>   - pseries_msi_ops_prepare() only prepares as many interrupts as the quota
>     permit.
> 
>   - pseries_irq_domain_alloc() fails if the device's quota is exceeded.
> 
> Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare
> as allowed by the quota. If device drivers attempt to allocate more
> interrupts than the quota permits, pseries_irq_domain_alloc() will return
> an error code and msi_handle_pci_fail() will allow device drivers a retry.
> 
> Reported-by: Nilay Shroff <nilay@linux.ibm.com>
> Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@linux.ibm.com/
> Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()")
> Signed-off-by: Nam Cao <namcao@linutronix.de>
> Acked-by: Nilay Shroff <nilay@linux.ibm.com>
> Cc: stable@vger.kernel.org
> ---
> v2:
>   - change pseries_msi_ops_prepare()'s allocation logic to match the
>     original logic in __pci_enable_msix_range()
> 
>   - fix up Nilay's email address
> ---
>  arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++--
>  1 file changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
> index a82aaa786e9e..edc30cda5dbc 100644
> --- a/arch/powerpc/platforms/pseries/msi.c
> +++ b/arch/powerpc/platforms/pseries/msi.c
> @@ -19,6 +19,11 @@
>  
>  #include "pseries.h"
>  
> +struct pseries_msi_device {
> +	unsigned int msi_quota;
> +	unsigned int msi_used;
> +};
> +
>  static int query_token, change_token;
>  
>  #define RTAS_QUERY_FN		0
> @@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
>  	struct msi_domain_info *info = domain->host_data;
>  	struct pci_dev *pdev = to_pci_dev(dev);
>  	int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
> +	int ret;
> +
> +	struct pseries_msi_device *pseries_dev __free(kfree)
> +		= kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
> +	if (!pseries_dev)
> +		return -ENOMEM;
> +
> +	while (1) {
> +		ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
> +		if (!ret)
> +			break;
> +		else if (ret > 0)
> +			nvec = ret;
> +		else
> +			return ret;
> +	}
>  
> -	return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
> +	pseries_dev->msi_quota = nvec;
> +	pseries_dev->msi_used = 0;
> +
> +	arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
> +	return 0;
>  }
>  
>  /*
> @@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
>   */
>  static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
>  {
> +	struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
>  	struct pci_dev *pdev = to_pci_dev(domain->dev);
>  
>  	rtas_disable_msi(pdev);
> +
> +	WARN_ON(pseries_dev->msi_used);
> +	kfree(pseries_dev);
>  }
>  
>  static void pseries_msi_shutdown(struct irq_data *d)
> @@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
>  				    unsigned int nr_irqs, void *arg)
>  {
>  	struct pci_controller *phb = domain->host_data;
> +	struct pseries_msi_device *pseries_dev;
>  	msi_alloc_info_t *info = arg;
>  	struct msi_desc *desc = info->desc;
>  	struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
>  	int hwirq;
>  	int i, ret;
>  
> +	pseries_dev = info->scratchpad[0].ptr;
> +
> +	if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
> +		return -ENOSPC;
> +
>  	hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
>  	if (hwirq < 0) {
>  		dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
> @@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
>  			goto out;
>  
>  		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
> -					      &pseries_msi_irq_chip, domain->host_data);
> +					      &pseries_msi_irq_chip, pseries_dev);
>  	}
>  
> +	pseries_dev->msi_used++;
>  	return 0;
>  
>  out:
> @@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
>  				    unsigned int nr_irqs)
>  {
>  	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
> -	struct pci_controller *phb = irq_data_get_irq_chip_data(d);
> +	struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
> +	struct pci_controller *phb = domain->host_data;
>  
>  	pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
> +	pseries_dev->msi_used -= nr_irqs;
>  	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
>  }
>  

I just tested this change on my system using the latest mainline kernel and it works
well for me. So with that, please fell free to add,

Tested-by: Nilay Shroff <nilay@linux.ibm.com>
Re: [PATCH v2] powerpc/pseries: Fix MSI-X allocation failure when quota is exceeded
Posted by Madhavan Srinivasan 1 month ago
On 1/7/26 7:55 PM, Nilay Shroff wrote:
>
> On 1/7/26 3:32 PM, Nam Cao wrote:
>> Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
>> to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
>> when the device's MSI-X table size is larger than the firmware's MSI quota
>> for the device.
>>
>> This is because the commit changes how rtas_prepare_msi_irqs() is called:
>>
>>    - Before, it is called when interrupts are allocated at the global
>>      interrupt domain with nvec_in being the number of allocated interrupts.
>>      rtas_prepare_msi_irqs() can return a positive number and the allocation
>>      will be retried.
>>
>>    - Now, it is called at the creation of per-device interrupt domain with
>>      nvec_in being the number of interrupts that the device supports. If
>>      rtas_prepare_msi_irqs() returns positive, domain creation just fails.
>>
>> For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
>> number (the quota). This causes per-device interrupt domain creation to
>> fail and thus the NVMe driver cannot enable MSI-X.
>>
>> Rework to make this scenario works again:
>>
>>    - pseries_msi_ops_prepare() only prepares as many interrupts as the quota
>>      permit.
>>
>>    - pseries_irq_domain_alloc() fails if the device's quota is exceeded.
>>
>> Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare
>> as allowed by the quota. If device drivers attempt to allocate more
>> interrupts than the quota permits, pseries_irq_domain_alloc() will return
>> an error code and msi_handle_pci_fail() will allow device drivers a retry.
>>
>> Reported-by: Nilay Shroff <nilay@linux.ibm.com>
>> Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@linux.ibm.com/
>> Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()")
>> Signed-off-by: Nam Cao <namcao@linutronix.de>
>> Acked-by: Nilay Shroff <nilay@linux.ibm.com>
>> Cc: stable@vger.kernel.org
>> ---
>> v2:
>>    - change pseries_msi_ops_prepare()'s allocation logic to match the
>>      original logic in __pci_enable_msix_range()
>>
>>    - fix up Nilay's email address
>> ---
>>   arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++--
>>   1 file changed, 41 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
>> index a82aaa786e9e..edc30cda5dbc 100644
>> --- a/arch/powerpc/platforms/pseries/msi.c
>> +++ b/arch/powerpc/platforms/pseries/msi.c
>> @@ -19,6 +19,11 @@
>>   
>>   #include "pseries.h"
>>   
>> +struct pseries_msi_device {
>> +	unsigned int msi_quota;
>> +	unsigned int msi_used;
>> +};
>> +
>>   static int query_token, change_token;
>>   
>>   #define RTAS_QUERY_FN		0
>> @@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
>>   	struct msi_domain_info *info = domain->host_data;
>>   	struct pci_dev *pdev = to_pci_dev(dev);
>>   	int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
>> +	int ret;
>> +
>> +	struct pseries_msi_device *pseries_dev __free(kfree)
>> +		= kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
>> +	if (!pseries_dev)
>> +		return -ENOMEM;
>> +
>> +	while (1) {
>> +		ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
>> +		if (!ret)
>> +			break;
>> +		else if (ret > 0)
>> +			nvec = ret;
>> +		else
>> +			return ret;
>> +	}
>>   
>> -	return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
>> +	pseries_dev->msi_quota = nvec;
>> +	pseries_dev->msi_used = 0;
>> +
>> +	arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
>> +	return 0;
>>   }
>>   
>>   /*
>> @@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
>>    */
>>   static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
>>   {
>> +	struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
>>   	struct pci_dev *pdev = to_pci_dev(domain->dev);
>>   
>>   	rtas_disable_msi(pdev);
>> +
>> +	WARN_ON(pseries_dev->msi_used);
>> +	kfree(pseries_dev);
>>   }
>>   
>>   static void pseries_msi_shutdown(struct irq_data *d)
>> @@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
>>   				    unsigned int nr_irqs, void *arg)
>>   {
>>   	struct pci_controller *phb = domain->host_data;
>> +	struct pseries_msi_device *pseries_dev;
>>   	msi_alloc_info_t *info = arg;
>>   	struct msi_desc *desc = info->desc;
>>   	struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
>>   	int hwirq;
>>   	int i, ret;
>>   
>> +	pseries_dev = info->scratchpad[0].ptr;
>> +
>> +	if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
>> +		return -ENOSPC;
>> +
>>   	hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
>>   	if (hwirq < 0) {
>>   		dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
>> @@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
>>   			goto out;
>>   
>>   		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
>> -					      &pseries_msi_irq_chip, domain->host_data);
>> +					      &pseries_msi_irq_chip, pseries_dev);
>>   	}
>>   
>> +	pseries_dev->msi_used++;
>>   	return 0;
>>   
>>   out:
>> @@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
>>   				    unsigned int nr_irqs)
>>   {
>>   	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
>> -	struct pci_controller *phb = irq_data_get_irq_chip_data(d);
>> +	struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
>> +	struct pci_controller *phb = domain->host_data;
>>   
>>   	pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
>> +	pseries_dev->msi_used -= nr_irqs;
>>   	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
>>   }
>>   
> I just tested this change on my system using the latest mainline kernel and it works
> well for me. So with that, please fell free to add,
>
> Tested-by: Nilay Shroff <nilay@linux.ibm.com>

Thanks will pull this in.
Maddy

>