[REGRESSION 04/04] Crash during resume of pcie bridge

Bert Karwatzki posted 4 patches 2 months, 1 week ago
[REGRESSION 04/04] Crash during resume of pcie bridge
Posted by Bert Karwatzki 2 months, 1 week ago
To home in further on the crash, we'll continue testing with
6.17.0-rc6-next-20250917-gpudebug-00029-ge797f42363d1,
which adds more dev_info() calls to the critical part of rpm_resume() and removes
some unneeded ones:

commit e797f42363d101b146971ec4d7e6c90bcc4064cd
Author: Bert Karwatzki <spasswolf@web.de>
Date:   Mon Oct 6 12:17:16 2025 +0200

    power: runtime: add more dev_info()s to rpm_resume()
    
    Signed-off-by: Bert Karwatzki <spasswolf@web.de>

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 27cce7f1b1d3..c99dac998047 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -793,12 +793,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 
  repeat:
 	if (dev->power.runtime_error) {
-		if (!strcmp(dev_name(dev), "0000:00:01.1"))
-			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		retval = -EINVAL;
 	} else if (dev->power.disable_depth > 0) {
-		if (!strcmp(dev_name(dev), "0000:00:01.1"))
-			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		if (dev->power.runtime_status == RPM_ACTIVE &&
 		    dev->power.last_status == RPM_ACTIVE)
 			retval = 1;
@@ -887,32 +883,22 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	 * the resume will actually succeed.
 	 */
 	if (dev->power.no_callbacks && !parent && dev->parent) {
-		if (!strcmp(dev_name(dev), "0000:00:01.1"))
-			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING);
 		if (dev->parent->power.disable_depth > 0 ||
 		    dev->parent->power.ignore_children ||
 		    dev->parent->power.runtime_status == RPM_ACTIVE) {
-			if (!strcmp(dev_name(dev), "0000:00:01.1"))
-				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			atomic_inc(&dev->parent->power.child_count);
 			spin_unlock(&dev->parent->power.lock);
 			retval = 1;
 			goto no_callback;	/* Assume success. */
 		}
 		spin_unlock(&dev->parent->power.lock);
-		if (!strcmp(dev_name(dev), "0000:00:01.1"))
-			dev_info(dev, "%s %d\n", __func__, __LINE__);
 	}
 
 	/* Carry out an asynchronous or a synchronous resume. */
 	if (rpmflags & RPM_ASYNC) {
-		if (!strcmp(dev_name(dev), "0000:00:01.1"))
-			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		dev->power.request = RPM_REQ_RESUME;
 		if (!dev->power.request_pending) {
-			if (!strcmp(dev_name(dev), "0000:00:01.1"))
-				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			dev->power.request_pending = true;
 			queue_work(pm_wq, &dev->power.work);
 		}
@@ -929,8 +915,11 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		if (!strcmp(dev_name(dev), "0000:00:01.1"))
 			dev_info(dev, "%s %d\n", __func__, __LINE__);
 		parent = dev->parent;
-		if (dev->power.irq_safe)
+		if (dev->power.irq_safe) {
+			if (!strcmp(dev_name(dev), "0000:00:01.1"))
+				dev_info(dev, "%s %d\n", __func__, __LINE__);
 			goto skip_parent;
+		}
 
 		spin_unlock(&dev->power.lock);
 
@@ -966,12 +955,22 @@ static int rpm_resume(struct device *dev, int rpmflags)
 	if (dev->power.no_callbacks)
 		goto no_callback;	/* Assume success. */
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	__update_runtime_status(dev, RPM_RESUMING);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	callback = RPM_GET_CALLBACK(dev, runtime_resume);
 
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d callback = %0x\n", __func__, __LINE__, (void *) callback);
 	dev_pm_disable_wake_irq_check(dev, false);
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	retval = rpm_callback(callback, dev);
+	if (!strcmp(dev_name(dev), "0000:00:01.1"))
+		dev_info(dev, "%s %d\n", __func__, __LINE__);
 	if (retval) {
 		if (!strcmp(dev_name(dev), "0000:00:01.1"))
 			dev_info(dev, "%s %d\n", __func__, __LINE__);

This test is currently running (booted at 13:05 on 2025-10-06), and I expect a
crash after at least 24 hours of runtime.

Bert Karwatzki