[PATCH v3 1/6] PCI: pnv_php: Properly clean up allocated IRQs on unplug

Timothy Pearson posted 6 patches 2 months, 3 weeks ago
[PATCH v3 1/6] PCI: pnv_php: Properly clean up allocated IRQs on unplug
Posted by Timothy Pearson 2 months, 3 weeks ago
In cases where the root of a nested PCIe bridge configuration is
unplugged, the pnv_php driver would leak the allocated IRQ resources for
the child bridges' hotplug event notifications, resulting in a panic.
Fix this by walking all child buses and deallocating all it's IRQ
resources before calling pci_hp_remove_devices.

Also modify the lifetime of the workqueue at struct pnv_php_slot::wq so
that it is only destroyed in pnv_php_free_slot, instead of
pnv_php_disable_irq. This is required since pnv_php_disable_irq will now
be called by workers triggered by hot unplug interrupts, so the
workqueue needs to stay allocated.

The abridged kernel panic that occurs without this patch is as follows:

  WARNING: CPU: 0 PID: 687 at kernel/irq/msi.c:292 msi_device_data_release+0x6c/0x9c
  CPU: 0 UID: 0 PID: 687 Comm: bash Not tainted 6.14.0-rc5+ #2
  Call Trace:
   msi_device_data_release+0x34/0x9c (unreliable)
   release_nodes+0x64/0x13c
   devres_release_all+0xc0/0x140
   device_del+0x2d4/0x46c
   pci_destroy_dev+0x5c/0x194
   pci_hp_remove_devices+0x90/0x128
   pci_hp_remove_devices+0x44/0x128
   pnv_php_disable_slot+0x54/0xd4
   power_write_file+0xf8/0x18c
   pci_slot_attr_store+0x40/0x5c
   sysfs_kf_write+0x64/0x78
   kernfs_fop_write_iter+0x1b0/0x290
   vfs_write+0x3bc/0x50c
   ksys_write+0x84/0x140
   system_call_exception+0x124/0x230
   system_call_vectored_common+0x15c/0x2ec

Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
 drivers/pci/hotplug/pnv_php.c | 94 ++++++++++++++++++++++++++++-------
 1 file changed, 75 insertions(+), 19 deletions(-)

diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 573a41869c15..aec0a6d594ac 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -3,6 +3,7 @@
  * PCI Hotplug Driver for PowerPC PowerNV platform.
  *
  * Copyright Gavin Shan, IBM Corporation 2016.
+ * Copyright (C) 2025 Raptor Engineering, LLC
  */
 
 #include <linux/bitfield.h>
@@ -36,8 +37,10 @@ static void pnv_php_register(struct device_node *dn);
 static void pnv_php_unregister_one(struct device_node *dn);
 static void pnv_php_unregister(struct device_node *dn);
 
+static void pnv_php_enable_irq(struct pnv_php_slot *php_slot);
+
 static void pnv_php_disable_irq(struct pnv_php_slot *php_slot,
-				bool disable_device)
+				bool disable_device, bool disable_msi)
 {
 	struct pci_dev *pdev = php_slot->pdev;
 	u16 ctrl;
@@ -53,19 +56,15 @@ static void pnv_php_disable_irq(struct pnv_php_slot *php_slot,
 		php_slot->irq = 0;
 	}
 
-	if (php_slot->wq) {
-		destroy_workqueue(php_slot->wq);
-		php_slot->wq = NULL;
-	}
-
-	if (disable_device) {
+	if (disable_device || disable_msi) {
 		if (pdev->msix_enabled)
 			pci_disable_msix(pdev);
 		else if (pdev->msi_enabled)
 			pci_disable_msi(pdev);
+	}
 
+	if (disable_device)
 		pci_disable_device(pdev);
-	}
 }
 
 static void pnv_php_free_slot(struct kref *kref)
@@ -74,7 +73,8 @@ static void pnv_php_free_slot(struct kref *kref)
 					struct pnv_php_slot, kref);
 
 	WARN_ON(!list_empty(&php_slot->children));
-	pnv_php_disable_irq(php_slot, false);
+	pnv_php_disable_irq(php_slot, false, false);
+	destroy_workqueue(php_slot->wq);
 	kfree(php_slot->name);
 	kfree(php_slot);
 }
@@ -561,8 +561,57 @@ static int pnv_php_reset_slot(struct hotplug_slot *slot, bool probe)
 static int pnv_php_enable_slot(struct hotplug_slot *slot)
 {
 	struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
+	u32 prop32;
+	int ret;
+
+	ret = pnv_php_enable(php_slot, true);
+	if (ret)
+		return ret;
+
+	/* (Re-)enable interrupt if the slot supports surprise hotplug */
+	ret = of_property_read_u32(php_slot->dn, "ibm,slot-surprise-pluggable", &prop32);
+	if (!ret && prop32)
+		pnv_php_enable_irq(php_slot);
+
+	return 0;
+}
+
+/**
+ * Disable any hotplug interrupts for all slots on the provided bus, as well as
+ * all downstream slots in preparation for a hot unplug.
+ */
+static int pnv_php_disable_all_irqs(struct pci_bus *bus)
+{
+	struct pci_bus *child_bus;
+	struct pci_slot *cur_slot;
+
+	/* First go down child busses */
+	list_for_each_entry(child_bus, &bus->children, node)
+		pnv_php_disable_all_irqs(child_bus);
+
+	/* Disable IRQs for all pnv_php slots on this bus */
+	list_for_each_entry(cur_slot, &bus->slots, list) {
+		struct pnv_php_slot *php_slot = to_pnv_php_slot(cur_slot->hotplug);
+
+		pnv_php_disable_irq(php_slot, false, true);
+	}
 
-	return pnv_php_enable(php_slot, true);
+	return 0;
+}
+
+/**
+ * Disable any hotplug interrupts for all downstream slots on the provided bus in
+ * preparation for a hot unplug.
+ */
+static int pnv_php_disable_all_downstream_irqs(struct pci_bus *bus)
+{
+	struct pci_bus *child_bus;
+
+	/* Go down child busses, recursively deactivating their IRQs */
+	list_for_each_entry(child_bus, &bus->children, node)
+		pnv_php_disable_all_irqs(child_bus);
+
+	return 0;
 }
 
 static int pnv_php_disable_slot(struct hotplug_slot *slot)
@@ -579,6 +628,12 @@ static int pnv_php_disable_slot(struct hotplug_slot *slot)
 	    php_slot->state != PNV_PHP_STATE_REGISTERED)
 		return 0;
 
+	/* Free all IRQ resources from all child slots before remove.
+	 * Note that we do not disable the root slot IRQ here as that
+	 * would also deactivate the slot hot (re)plug interrupt!
+	 */
+	pnv_php_disable_all_downstream_irqs(php_slot->bus);
+
 	/* Remove all devices behind the slot */
 	pci_lock_rescan_remove();
 	pci_hp_remove_devices(php_slot->bus);
@@ -647,6 +702,15 @@ static struct pnv_php_slot *pnv_php_alloc_slot(struct device_node *dn)
 		return NULL;
 	}
 
+	/* Allocate workqueue for this slot's interrupt handling */
+	php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name);
+	if (!php_slot->wq) {
+		SLOT_WARN(php_slot, "Cannot alloc workqueue\n");
+		kfree(php_slot->name);
+		kfree(php_slot);
+		return NULL;
+	}
+
 	if (dn->child && PCI_DN(dn->child))
 		php_slot->slot_no = PCI_SLOT(PCI_DN(dn->child)->devfn);
 	else
@@ -843,14 +907,6 @@ static void pnv_php_init_irq(struct pnv_php_slot *php_slot, int irq)
 	u16 sts, ctrl;
 	int ret;
 
-	/* Allocate workqueue */
-	php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name);
-	if (!php_slot->wq) {
-		SLOT_WARN(php_slot, "Cannot alloc workqueue\n");
-		pnv_php_disable_irq(php_slot, true);
-		return;
-	}
-
 	/* Check PDC (Presence Detection Change) is broken or not */
 	ret = of_property_read_u32(php_slot->dn, "ibm,slot-broken-pdc",
 				   &broken_pdc);
@@ -869,7 +925,7 @@ static void pnv_php_init_irq(struct pnv_php_slot *php_slot, int irq)
 	ret = request_irq(irq, pnv_php_interrupt, IRQF_SHARED,
 			  php_slot->name, php_slot);
 	if (ret) {
-		pnv_php_disable_irq(php_slot, true);
+		pnv_php_disable_irq(php_slot, true, true);
 		SLOT_WARN(php_slot, "Error %d enabling IRQ %d\n", ret, irq);
 		return;
 	}
-- 
2.39.5
Re: [PATCH v3 1/6] PCI: pnv_php: Properly clean up allocated IRQs on unplug
Posted by Jiri Slaby 3 weeks, 6 days ago
On 15. 07. 25, 23:36, Timothy Pearson wrote:
> In cases where the root of a nested PCIe bridge configuration is
> unplugged, the pnv_php driver would leak the allocated IRQ resources for
> the child bridges' hotplug event notifications, resulting in a panic.
> Fix this by walking all child buses and deallocating all it's IRQ
> resources before calling pci_hp_remove_devices.
> 
> Also modify the lifetime of the workqueue at struct pnv_php_slot::wq so
> that it is only destroyed in pnv_php_free_slot, instead of
> pnv_php_disable_irq. This is required since pnv_php_disable_irq will now
> be called by workers triggered by hot unplug interrupts, so the
> workqueue needs to stay allocated.
> 
> The abridged kernel panic that occurs without this patch is as follows:
> 
>    WARNING: CPU: 0 PID: 687 at kernel/irq/msi.c:292 msi_device_data_release+0x6c/0x9c
>    CPU: 0 UID: 0 PID: 687 Comm: bash Not tainted 6.14.0-rc5+ #2
>    Call Trace:
>     msi_device_data_release+0x34/0x9c (unreliable)
>     release_nodes+0x64/0x13c
>     devres_release_all+0xc0/0x140
>     device_del+0x2d4/0x46c
>     pci_destroy_dev+0x5c/0x194
>     pci_hp_remove_devices+0x90/0x128
>     pci_hp_remove_devices+0x44/0x128
>     pnv_php_disable_slot+0x54/0xd4
>     power_write_file+0xf8/0x18c
>     pci_slot_attr_store+0x40/0x5c
>     sysfs_kf_write+0x64/0x78
>     kernfs_fop_write_iter+0x1b0/0x290
>     vfs_write+0x3bc/0x50c
>     ksys_write+0x84/0x140
>     system_call_exception+0x124/0x230
>     system_call_vectored_common+0x15c/0x2ec
> 
> Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
> Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
> ---
>   drivers/pci/hotplug/pnv_php.c | 94 ++++++++++++++++++++++++++++-------
>   1 file changed, 75 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
> index 573a41869c15..aec0a6d594ac 100644
> --- a/drivers/pci/hotplug/pnv_php.c
> +++ b/drivers/pci/hotplug/pnv_php.c
...
> @@ -647,6 +702,15 @@ static struct pnv_php_slot *pnv_php_alloc_slot(struct device_node *dn)

This is preceded by:
         php_slot = kzalloc(sizeof(*php_slot), GFP_KERNEL);

Ie. php_slot is zeroed.

>   		return NULL;
>   	}
>   
> +	/* Allocate workqueue for this slot's interrupt handling */
> +	php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name);
> +	if (!php_slot->wq) {
> +		SLOT_WARN(php_slot, "Cannot alloc workqueue\n");

I believe this introduced a (unlikely) NULL ptr dereference.

> +		kfree(php_slot->name);
> +		kfree(php_slot);
> +		return NULL;
> +	}
> +
>   	if (dn->child && PCI_DN(dn->child))
>   		php_slot->slot_no = PCI_SLOT(PCI_DN(dn->child)->devfn);
>   	else

This continues:
         php_slot->pdev                  = bus->self;
         php_slot->bus                   = bus;


And SLOT_WARN() is defined as:
#define SLOT_WARN(sl, x...) \
         ((sl)->pdev ? pci_warn((sl)->pdev, x) : 
dev_warn(&(sl)->bus->dev, x))

The else branch is alkays taken in the 'if' above, which still 
dereferences NULLed (sl)->bus here.

> @@ -843,14 +907,6 @@ static void pnv_php_init_irq(struct pnv_php_slot *php_slot, int irq)
>   	u16 sts, ctrl;
>   	int ret;
>   
> -	/* Allocate workqueue */
> -	php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name);
> -	if (!php_slot->wq) {
> -		SLOT_WARN(php_slot, "Cannot alloc workqueue\n");

Here, php_slot used to have both ->pdev and ->bus assigned at this point.

> -		pnv_php_disable_irq(php_slot, true);
> -		return;
> -	}
> -

Right?

thanks,
-- 
js
suse labs
Re: [PATCH v3 1/6] PCI: pnv_php: Properly clean up allocated IRQs on unplug
Posted by Timothy Pearson 3 weeks, 6 days ago

----- Original Message -----
> From: "Jiri Slaby" <jirislaby@kernel.org>
> To: "Timothy Pearson" <tpearson@raptorengineering.com>
> Cc: "linuxppc-dev" <linuxppc-dev@lists.ozlabs.org>, "linux-kernel" <linux-kernel@vger.kernel.org>, "linux-pci"
> <linux-pci@vger.kernel.org>, "Madhavan Srinivasan" <maddy@linux.ibm.com>, "Michael Ellerman" <mpe@ellerman.id.au>,
> "christophe leroy" <christophe.leroy@csgroup.eu>, "Naveen N Rao" <naveen@kernel.org>, "Bjorn Helgaas"
> <bhelgaas@google.com>, "Shawn Anastasio" <sanastasio@raptorengineering.com>
> Sent: Tuesday, September 9, 2025 4:00:41 AM
> Subject: Re: [PATCH v3 1/6] PCI: pnv_php: Properly clean up allocated IRQs on unplug

> On 15. 07. 25, 23:36, Timothy Pearson wrote:
>> In cases where the root of a nested PCIe bridge configuration is
>> unplugged, the pnv_php driver would leak the allocated IRQ resources for
>> the child bridges' hotplug event notifications, resulting in a panic.
>> Fix this by walking all child buses and deallocating all it's IRQ
>> resources before calling pci_hp_remove_devices.
>> 
>> Also modify the lifetime of the workqueue at struct pnv_php_slot::wq so
>> that it is only destroyed in pnv_php_free_slot, instead of
>> pnv_php_disable_irq. This is required since pnv_php_disable_irq will now
>> be called by workers triggered by hot unplug interrupts, so the
>> workqueue needs to stay allocated.
>> 
>> The abridged kernel panic that occurs without this patch is as follows:
>> 
>>    WARNING: CPU: 0 PID: 687 at kernel/irq/msi.c:292
>>    msi_device_data_release+0x6c/0x9c
>>    CPU: 0 UID: 0 PID: 687 Comm: bash Not tainted 6.14.0-rc5+ #2
>>    Call Trace:
>>     msi_device_data_release+0x34/0x9c (unreliable)
>>     release_nodes+0x64/0x13c
>>     devres_release_all+0xc0/0x140
>>     device_del+0x2d4/0x46c
>>     pci_destroy_dev+0x5c/0x194
>>     pci_hp_remove_devices+0x90/0x128
>>     pci_hp_remove_devices+0x44/0x128
>>     pnv_php_disable_slot+0x54/0xd4
>>     power_write_file+0xf8/0x18c
>>     pci_slot_attr_store+0x40/0x5c
>>     sysfs_kf_write+0x64/0x78
>>     kernfs_fop_write_iter+0x1b0/0x290
>>     vfs_write+0x3bc/0x50c
>>     ksys_write+0x84/0x140
>>     system_call_exception+0x124/0x230
>>     system_call_vectored_common+0x15c/0x2ec
>> 
>> Signed-off-by: Shawn Anastasio <sanastasio@raptorengineering.com>
>> Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
>> ---
>>   drivers/pci/hotplug/pnv_php.c | 94 ++++++++++++++++++++++++++++-------
>>   1 file changed, 75 insertions(+), 19 deletions(-)
>> 
>> diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
>> index 573a41869c15..aec0a6d594ac 100644
>> --- a/drivers/pci/hotplug/pnv_php.c
>> +++ b/drivers/pci/hotplug/pnv_php.c
> ...
>> @@ -647,6 +702,15 @@ static struct pnv_php_slot *pnv_php_alloc_slot(struct
>> device_node *dn)
> 
> This is preceded by:
>         php_slot = kzalloc(sizeof(*php_slot), GFP_KERNEL);
> 
> Ie. php_slot is zeroed.
> 
>>   		return NULL;
>>   	}
>>   
>> +	/* Allocate workqueue for this slot's interrupt handling */
>> +	php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name);
>> +	if (!php_slot->wq) {
>> +		SLOT_WARN(php_slot, "Cannot alloc workqueue\n");
> 
> I believe this introduced a (unlikely) NULL ptr dereference.
> 
>> +		kfree(php_slot->name);
>> +		kfree(php_slot);
>> +		return NULL;
>> +	}
>> +
>>   	if (dn->child && PCI_DN(dn->child))
>>   		php_slot->slot_no = PCI_SLOT(PCI_DN(dn->child)->devfn);
>>   	else
> 
> This continues:
>         php_slot->pdev                  = bus->self;
>         php_slot->bus                   = bus;
> 
> 
> And SLOT_WARN() is defined as:
> #define SLOT_WARN(sl, x...) \
>         ((sl)->pdev ? pci_warn((sl)->pdev, x) :
> dev_warn(&(sl)->bus->dev, x))
> 
> The else branch is alkays taken in the 'if' above, which still
> dereferences NULLed (sl)->bus here.
> 
>> @@ -843,14 +907,6 @@ static void pnv_php_init_irq(struct pnv_php_slot *php_slot,
>> int irq)
>>   	u16 sts, ctrl;
>>   	int ret;
>>   
>> -	/* Allocate workqueue */
>> -	php_slot->wq = alloc_workqueue("pciehp-%s", 0, 0, php_slot->name);
>> -	if (!php_slot->wq) {
>> -		SLOT_WARN(php_slot, "Cannot alloc workqueue\n");
> 
> Here, php_slot used to have both ->pdev and ->bus assigned at this point.
> 
>> -		pnv_php_disable_irq(php_slot, true);
>> -		return;
>> -	}
>> -
> 
> Right?

That does look like an unlikely but definitely possible dereference -- good catch!

I can submit a patch to change

SLOT_WARN(php_slot, "Cannot alloc workqueue\n");

to

dev_warn(bus->dev, "Cannot alloc workqueue\n");

Would that work?