drivers/thunderbolt/tb.c | 95 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+)
PCIe devices behind Thunderbolt tunnels may fail to enumerate when
spurious hotplug events prevent pciehp from detecting link-up.
Root cause:
Spurious unplug events occur immediately after tunnel activation:
[ 932.438] thunderbolt: acking hot unplug event on 702:2
[ 932.852] thunderbolt: PCIe Up path activation complete
[ 932.855] thunderbolt: hotplug event for upstream port 702:2
(unplug: 0)
[ 932.855] thunderbolt: hotplug event for upstream port 702:2
(unplug: 1)
These events disrupt pciehp timing, causing device enumeration to fail
~70% of the time on affected hardware. Manual PCI rescan succeeds,
proving devices are present and functional on the bus.
Solution:
Schedule delayed work (300ms) after tunnel activation to:
1. Check if pciehp successfully enumerated devices (device count increased)
2. If not, trigger pci_rescan_bus() to discover devices manually
3. Log results for observability
The delayed work approach is non-blocking and only rescans when actually
needed, avoiding overhead on systems where pciehp works correctly.
Signed-off-by: Chia-Lin Kao (AceLan) <acelan.kao@canonical.com>
---
Logs: https://people.canonical.com/~acelan/bugs/tbt_storage/
merged.out.bad: Plugged-in TBT storage, but eventually fails to enumerate
merged.out.good: Plugged-in TBT storage, and successfully enumerates
merged.out.patched: Plugged-in TBT storage, it should fail without this
patch, but it works now
---
drivers/thunderbolt/tb.c | 95 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 95 insertions(+)
diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index 293fc9f258a5c..1cfc9a265c453 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -11,6 +11,7 @@
#include <linux/delay.h>
#include <linux/pm_runtime.h>
#include <linux/platform_data/x86/apple.h>
+#include <linux/pci.h>
#include "tb.h"
#include "tb_regs.h"
@@ -18,6 +19,7 @@
#define TB_TIMEOUT 100 /* ms */
#define TB_RELEASE_BW_TIMEOUT 10000 /* ms */
+#define TB_PCIEHP_ENUMERATION_DELAY 300 /* ms */
/*
* How many time bandwidth allocation request from graphics driver is
@@ -83,6 +85,16 @@ struct tb_hotplug_event {
int retry;
};
+/* Delayed work to verify PCIe enumeration after tunnel activation */
+struct tb_pci_rescan_work {
+ struct delayed_work work;
+ struct tb *tb;
+ struct pci_bus *bus;
+ int devices_before;
+ u64 route;
+ u8 port;
+};
+
static void tb_scan_port(struct tb_port *port);
static void tb_handle_hotplug(struct work_struct *work);
static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port,
@@ -90,6 +102,60 @@ static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port,
static void tb_queue_dp_bandwidth_request(struct tb *tb, u64 route, u8 port,
int retry, unsigned long delay);
+static void tb_pci_rescan_work_fn(struct work_struct *work)
+{
+ struct tb_pci_rescan_work *rescan_work =
+ container_of(work, typeof(*rescan_work), work.work);
+ struct tb *tb = rescan_work->tb;
+ struct pci_bus *bus = rescan_work->bus;
+ int devices_after = 0;
+ struct pci_dev *dev;
+ struct tb_switch *sw;
+ struct tb_port *port;
+
+ mutex_lock(&tb->lock);
+
+ sw = tb_switch_find_by_route(tb, rescan_work->route);
+ if (!sw) {
+ tb_dbg(tb, "Switch at route %llx disappeared, skipping rescan\n",
+ rescan_work->route);
+ goto out_unlock;
+ }
+
+ port = &sw->ports[rescan_work->port];
+
+ pci_lock_rescan_remove();
+ for_each_pci_dev(dev)
+ devices_after++;
+ pci_unlock_rescan_remove();
+
+ if (devices_after > rescan_work->devices_before) {
+ tb_port_dbg(port, "pciehp enumerated %d new device(s)\n",
+ devices_after - rescan_work->devices_before);
+ } else {
+ tb_port_info(port, "pciehp failed to enumerate devices, triggering rescan\n");
+
+ pci_lock_rescan_remove();
+ pci_rescan_bus(bus);
+
+ devices_after = 0;
+ for_each_pci_dev(dev)
+ devices_after++;
+ pci_unlock_rescan_remove();
+
+ if (devices_after > rescan_work->devices_before)
+ tb_port_info(port, "rescan found %d new device(s)\n",
+ devices_after - rescan_work->devices_before);
+ else
+ tb_port_warn(port, "no devices found even after rescan\n");
+ }
+
+ tb_switch_put(sw);
+out_unlock:
+ mutex_unlock(&tb->lock);
+ kfree(rescan_work);
+}
+
static void tb_queue_hotplug(struct tb *tb, u64 route, u8 port, bool unplug)
{
struct tb_hotplug_event *ev;
@@ -2400,6 +2466,35 @@ static int tb_tunnel_pci(struct tb *tb, struct tb_switch *sw)
tb_sw_warn(sw, "failed to connect xHCI\n");
list_add_tail(&tunnel->list, &tcm->tunnel_list);
+
+ /* Verify pciehp enumeration; trigger rescan if needed */
+ if (tb->nhi && tb->nhi->pdev && tb->nhi->pdev->bus) {
+ struct pci_bus *bus = tb->nhi->pdev->bus;
+ struct pci_bus *scan_bus = bus->parent ? bus->parent : bus;
+ struct tb_pci_rescan_work *rescan_work;
+ struct pci_dev *dev;
+ int devices_before = 0;
+
+ pci_lock_rescan_remove();
+ for_each_pci_dev(dev)
+ devices_before++;
+ pci_unlock_rescan_remove();
+
+ rescan_work = kmalloc_obj(rescan_work, GFP_KERNEL);
+ if (!rescan_work)
+ return 0;
+
+ rescan_work->tb = tb;
+ rescan_work->bus = scan_bus;
+ rescan_work->devices_before = devices_before;
+ rescan_work->route = tb_route(sw);
+ rescan_work->port = up->port;
+
+ INIT_DELAYED_WORK(&rescan_work->work, tb_pci_rescan_work_fn);
+ queue_delayed_work(tb->wq, &rescan_work->work,
+ msecs_to_jiffies(TB_PCIEHP_ENUMERATION_DELAY));
+ }
+
return 0;
}
--
2.51.0
Hi,

On Wed, Jan 21, 2026 at 01:27:44PM +0800, Chia-Lin Kao (AceLan) wrote:
> PCIe devices behind Thunderbolt tunnels may fail to enumerate when
> spurious hotplug events prevent pciehp from detecting link-up.
>
> Root cause:
>
> Spurious unplug events occur immediately after tunnel activation:
>
> [ 932.438] thunderbolt: acking hot unplug event on 702:2
> [ 932.852] thunderbolt: PCIe Up path activation complete
> [ 932.855] thunderbolt: hotplug event for upstream port 702:2
> (unplug: 0)
> [ 932.855] thunderbolt: hotplug event for upstream port 702:2
> (unplug: 1)
>
> These events disrupt pciehp timing, causing device enumeration to fail
> ~70% of the time on affected hardware. Manual PCI rescan succeeds,
> proving devices are present and functional on the bus.

Thanks for the report!

They are likely TB3 xHCI "plug" events or so but they should not affect
anything really.

It may be that there is something in the TB3 compatibility side that we are
not doing which needs to be investigated.

From your merged.out.bad:

CM does xHCI connect here:

[ 152.905840] [182] thunderbolt 0000:c7:00.6: 702: xHCI connect request
[ 152.906865] [182] thunderbolt 0000:c7:00.6: hotplug event for upstream port 702:2 (unplug: 0)
[ 152.906869] [182] thunderbolt 0000:c7:00.6: 2:8: got plug event for connected port, ignoring
[ 152.906872] [182] thunderbolt 0000:c7:00.6: hotplug event for upstream port 702:2 (unplug: 1)
[ 152.906875] [182] thunderbolt 0000:c7:00.6: 2:8: got unplug event for disconnected port, ignoring

[ 192.931373] [49] thunderbolt 0000:c7:00.6: acking hot unplug event on 2:7

Can you comment out call to tb_switch_xhci_connect() and see if that
changes anything?

> Solution:
>
> Schedule delayed work (300ms) after tunnel activation to:
> 1. Check if pciehp successfully enumerated devices (device count increased)
> 2. If not, trigger pci_rescan_bus() to discover devices manually
> 3. Log results for observability
>
> The delayed work approach is non-blocking and only rescans when actually
> needed, avoiding overhead on systems where pciehp works correctly.

There is no way we are going to call PCI functions from the tb.c.
On Wed, Jan 21, 2026 at 07:01:42AM +0100, Mika Westerberg wrote:
> Hi,
>
> On Wed, Jan 21, 2026 at 01:27:44PM +0800, Chia-Lin Kao (AceLan) wrote:
> > PCIe devices behind Thunderbolt tunnels may fail to enumerate when
> > spurious hotplug events prevent pciehp from detecting link-up.
> >
> > Root cause:
> >
> > Spurious unplug events occur immediately after tunnel activation:
> >
> > [ 932.438] thunderbolt: acking hot unplug event on 702:2
> > [ 932.852] thunderbolt: PCIe Up path activation complete
> > [ 932.855] thunderbolt: hotplug event for upstream port 702:2
> > (unplug: 0)
> > [ 932.855] thunderbolt: hotplug event for upstream port 702:2
> > (unplug: 1)
> >
> > These events disrupt pciehp timing, causing device enumeration to fail
> > ~70% of the time on affected hardware. Manual PCI rescan succeeds,
> > proving devices are present and functional on the bus.
>
> Thanks for the report!
>
> They are likely TB3 xHCI "plug" events or so but they should not affect
> anything really.
>
> It may be that there is something in the TB3 compatibility side that we are
> not doing which needs to be investigated.
>
> From your merged.out.bad:
>
> CM does xHCI connect here:
>
> [ 152.905840] [182] thunderbolt 0000:c7:00.6: 702: xHCI connect request
> [ 152.906865] [182] thunderbolt 0000:c7:00.6: hotplug event for upstream port 702:2 (unplug: 0)
> [ 152.906869] [182] thunderbolt 0000:c7:00.6: 2:8: got plug event for connected port, ignoring
> [ 152.906872] [182] thunderbolt 0000:c7:00.6: hotplug event for upstream port 702:2 (unplug: 1)
> [ 152.906875] [182] thunderbolt 0000:c7:00.6: 2:8: got unplug event for disconnected port, ignoring
>
> [ 192.931373] [49] thunderbolt 0000:c7:00.6: acking hot unplug event on 2:7
>
> Can you comment out call to tb_switch_xhci_connect() and see if that
> changes anything?
Here is what I modified, and the problem becomes a little bit complicated.
I did the following steps(1~5) and captured the tbtrace log at step 5.
https://people.canonical.com/~acelan/bugs/tbt_storage/merged.out.remove_tb_switch_xhci_connect.out
1. Plugged one tbt storage on the Dock and connected the dock to the
machine
2. Boot the machine up, and it recognizes the tbt storage
3. Plugged the second tbt storage on the dock, and it also can be
recognized (this step always failed before)
4. Unplugged the first and second tbt storage from the dock, and then
re-plugged the first tbt storage on the dock, and it can be recognized
5. Re-plugged the second tbt storage on the dock, and it fails.
(continue doing the following tests)
a. When the issue happens, re-plugging the second tbt storage doesn't
work.
b. Plugged both tbt storages on the dock, and then re-plugged the dock to
the machine, both tbt storages can be recognized.
b.1 In this case, it works when re-plugging the first or the
second tbt storage on the dock(there is always one tbt storage
still connected to the dock)
b.2 Removed both tbt storages from the dock, and then
re-plugged them one by one, and the second tbt storage can't be
recognized.
c. Plugged one tbt storage on the dock, and then re-connected the dock to
the machine, the tbt storage can be recognized.
c.1 Plugged the second tbt storage on the dock, and the second tbt
storage can be recognized.
c.2 Re-plugged the first or the second tbt storage on the
dock, both tbt storages can be recognized.
c.3 Removed both tbt storages from the dock, and then
re-plugged them one by one, and the second tbt storage can't be
recognized.(same as b.2)
The issue could be reproduced when connecting the second tbt storage
to the dock.
1. Connect the dock to the machine with any tbt storage
2. Or remove all tbt storages from the dock if the dock is connected
3. And then plugged tbt storages one by one, and the second one won't be
recognized.
rescan finds the missing tbt storage, but it works only one time. Need
to rescan again when re-plugging the first or the second tbt storage.
echo 1 | sudo tee /sys/bus/pci/rescan
BTW, when the second tbt storage can't be recognized, unplugging the first tbt
storage from the dock makes the second tbt storage become recognized.
And when the first tbt storage is then re-plugged on the dock, it can't be
recognized. The behavior looks like it always affects whichever tbt storage
is plugged in second.
diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index 4f5f1dfc0fbf..be7ff82a3846 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -2503,8 +2503,8 @@ static void tb_handle_hotplug(struct work_struct *work)
} else if (port->remote) {
tb_port_dbg(port, "got plug event for connected port, ignoring\n");
} else if (!port->port && sw->authorized) {
- tb_sw_dbg(sw, "xHCI connect request\n");
- tb_switch_xhci_connect(sw);
+ tb_sw_dbg(sw, "DEBUG: Comment out xHCI connect request\n");
+ //tb_switch_xhci_connect(sw);
} else {
if (tb_port_is_null(port)) {
tb_port_dbg(port, "hotplug: scanning\n");
>
> > Solution:
> >
> > Schedule delayed work (300ms) after tunnel activation to:
> > 1. Check if pciehp successfully enumerated devices (device count increased)
> > 2. If not, trigger pci_rescan_bus() to discover devices manually
> > 3. Log results for observability
> >
> > The delayed work approach is non-blocking and only rescans when actually
> > needed, avoiding overhead on systems where pciehp works correctly.
>
> There is no way we are going to call PCI functions from the tb.c.
Hi,
On Fri, Jan 23, 2026 at 10:04:11AM +0800, Chia-Lin Kao (AceLan) wrote:
> > Can you comment out call to tb_switch_xhci_connect() and see if that
> > changes anything?
>
> Here is what I modified, and the problem becomes a little bit complicated.
Okay I see it did not change anything (well this is kind of what I
expected). Thanks for trying.
I see in your log that the PCIe tunnel is established just fine. It's just
that there is no PCIe hotplug happening or it is happening but the PCIe
Downstream Port is not waking up.
I figured you have following USB4/TB topology, right?
AMD Host <-> GR Hub <-> TB3 Hub
^
|
TB3 Hub
What if you run 'lspci' after the issue reproduces? Does that bring the
missing PCIe devices? I suspect that this is due to older TB3 devices, as
they may need a bit more time to get the PCIe link (going over the tunnel) up
and running.
Chia-Lin Kao (AceLan) <acelan.kao@canonical.com> 於 2026年1月21日週三 下午1:27寫道:
>
> PCIe devices behind Thunderbolt tunnels may fail to enumerate when
> spurious hotplug events prevent pciehp from detecting link-up.
>
> Root cause:
>
> Spurious unplug events occur immediately after tunnel activation:
>
> [ 932.438] thunderbolt: acking hot unplug event on 702:2
> [ 932.852] thunderbolt: PCIe Up path activation complete
> [ 932.855] thunderbolt: hotplug event for upstream port 702:2
> (unplug: 0)
> [ 932.855] thunderbolt: hotplug event for upstream port 702:2
> (unplug: 1)
>
> These events disrupt pciehp timing, causing device enumeration to fail
> ~70% of the time on affected hardware. Manual PCI rescan succeeds,
> proving devices are present and functional on the bus.
>
> Solution:
>
> Schedule delayed work (300ms) after tunnel activation to:
> 1. Check if pciehp successfully enumerated devices (device count increased)
> 2. If not, trigger pci_rescan_bus() to discover devices manually
> 3. Log results for observability
>
> The delayed work approach is non-blocking and only rescans when actually
> needed, avoiding overhead on systems where pciehp works correctly.
>
> Signed-off-by: Chia-Lin Kao (AceLan) <acelan.kao@canonical.com>
> ---
> Logs: https://people.canonical.com/~acelan/bugs/tbt_storage/
> merged.out.bad: Plugged-in TBT storage, but eventually fails to enumerate
> merged.out.good: Plugged-in TBT storage, and successfully enumerates
> merged.out.patched: Plugged-in TBT storage, it should fail without this
> patch, but it works now
> ---
> drivers/thunderbolt/tb.c | 95 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 95 insertions(+)
>
> diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
> index 293fc9f258a5c..1cfc9a265c453 100644
> --- a/drivers/thunderbolt/tb.c
> +++ b/drivers/thunderbolt/tb.c
> @@ -11,6 +11,7 @@
> #include <linux/delay.h>
> #include <linux/pm_runtime.h>
> #include <linux/platform_data/x86/apple.h>
> +#include <linux/pci.h>
>
> #include "tb.h"
> #include "tb_regs.h"
> @@ -18,6 +19,7 @@
>
> #define TB_TIMEOUT 100 /* ms */
> #define TB_RELEASE_BW_TIMEOUT 10000 /* ms */
> +#define TB_PCIEHP_ENUMERATION_DELAY 300 /* ms */
>
> /*
> * How many time bandwidth allocation request from graphics driver is
> @@ -83,6 +85,16 @@ struct tb_hotplug_event {
> int retry;
> };
>
> +/* Delayed work to verify PCIe enumeration after tunnel activation */
> +struct tb_pci_rescan_work {
> + struct delayed_work work;
> + struct tb *tb;
> + struct pci_bus *bus;
> + int devices_before;
> + u64 route;
> + u8 port;
> +};
> +
> static void tb_scan_port(struct tb_port *port);
> static void tb_handle_hotplug(struct work_struct *work);
> static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port,
> @@ -90,6 +102,60 @@ static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port,
> static void tb_queue_dp_bandwidth_request(struct tb *tb, u64 route, u8 port,
> int retry, unsigned long delay);
>
> +static void tb_pci_rescan_work_fn(struct work_struct *work)
> +{
> + struct tb_pci_rescan_work *rescan_work =
> + container_of(work, typeof(*rescan_work), work.work);
> + struct tb *tb = rescan_work->tb;
> + struct pci_bus *bus = rescan_work->bus;
> + int devices_after = 0;
> + struct pci_dev *dev;
> + struct tb_switch *sw;
> + struct tb_port *port;
> +
> + mutex_lock(&tb->lock);
> +
> + sw = tb_switch_find_by_route(tb, rescan_work->route);
> + if (!sw) {
> + tb_dbg(tb, "Switch at route %llx disappeared, skipping rescan\n",
> + rescan_work->route);
> + goto out_unlock;
> + }
> +
> + port = &sw->ports[rescan_work->port];
> +
> + pci_lock_rescan_remove();
> + for_each_pci_dev(dev)
> + devices_after++;
> + pci_unlock_rescan_remove();
> +
> + if (devices_after > rescan_work->devices_before) {
> + tb_port_dbg(port, "pciehp enumerated %d new device(s)\n",
> + devices_after - rescan_work->devices_before);
> + } else {
> + tb_port_info(port, "pciehp failed to enumerate devices, triggering rescan\n");
> +
> + pci_lock_rescan_remove();
> + pci_rescan_bus(bus);
> +
> + devices_after = 0;
> + for_each_pci_dev(dev)
> + devices_after++;
> + pci_unlock_rescan_remove();
> +
> + if (devices_after > rescan_work->devices_before)
> + tb_port_info(port, "rescan found %d new device(s)\n",
> + devices_after - rescan_work->devices_before);
> + else
> + tb_port_warn(port, "no devices found even after rescan\n");
> + }
> +
> + tb_switch_put(sw);
> +out_unlock:
> + mutex_unlock(&tb->lock);
> + kfree(rescan_work);
> +}
> +
> static void tb_queue_hotplug(struct tb *tb, u64 route, u8 port, bool unplug)
> {
> struct tb_hotplug_event *ev;
> @@ -2400,6 +2466,35 @@ static int tb_tunnel_pci(struct tb *tb, struct tb_switch *sw)
> tb_sw_warn(sw, "failed to connect xHCI\n");
>
> list_add_tail(&tunnel->list, &tcm->tunnel_list);
> +
> + /* Verify pciehp enumeration; trigger rescan if needed */
> + if (tb->nhi && tb->nhi->pdev && tb->nhi->pdev->bus) {
> + struct pci_bus *bus = tb->nhi->pdev->bus;
> + struct pci_bus *scan_bus = bus->parent ? bus->parent : bus;
> + struct tb_pci_rescan_work *rescan_work;
> + struct pci_dev *dev;
> + int devices_before = 0;
> +
> + pci_lock_rescan_remove();
> + for_each_pci_dev(dev)
> + devices_before++;
> + pci_unlock_rescan_remove();
> +
> + rescan_work = kmalloc_obj(rescan_work, GFP_KERNEL);
Sorry, didn't re-check after checkpatch modified it.
kmalloc_obj() is undefined here.
I'll submit v2 later.
> + if (!rescan_work)
> + return 0;
> +
> + rescan_work->tb = tb;
> + rescan_work->bus = scan_bus;
> + rescan_work->devices_before = devices_before;
> + rescan_work->route = tb_route(sw);
> + rescan_work->port = up->port;
> +
> + INIT_DELAYED_WORK(&rescan_work->work, tb_pci_rescan_work_fn);
> + queue_delayed_work(tb->wq, &rescan_work->work,
> + msecs_to_jiffies(TB_PCIEHP_ENUMERATION_DELAY));
> + }
> +
> return 0;
> }
>
> --
> 2.51.0
>
--
Chia-Lin Kao(AceLan)
http://blog.acelan.idv.tw/
E-Mail: acelan.kaoATcanonical.com (s/AT/@/)
© 2016 - 2026 Red Hat, Inc.