drivers/usb/host/xhci-ring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
Currently xhci_handshake is a polling loop that waits for change of state.
If this loop is executed while holding a spinlock with IRQs disabled, it
can block interrupts for up to 5 seconds.
To prevent prolonged IRQ disable durations that may lead to watchdog
timeouts, release the spinlock before invoking xhci_handshake() in
xhci_abort_cmd_ring().
The spinlock is reacquired after the handshake to continue with controller
halt and recovery if needed.
Signed-off-by: Uttkarsh Aggarwal <uttkarsh.aggarwal@oss.qualcomm.com>
---
drivers/usb/host/xhci-ring.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 8e209aa33ea7..fca4df6a4699 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -518,10 +518,12 @@ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci, unsigned long flags)
* In the future we should distinguish between -ENODEV and -ETIMEDOUT
* and try to recover a -ETIMEDOUT with a host controller reset.
*/
+ spin_unlock_irqrestore(&xhci->lock, flags);
ret = xhci_handshake(&xhci->op_regs->cmd_ring,
CMD_RING_RUNNING, 0, 5 * 1000 * 1000);
if (ret < 0) {
xhci_err(xhci, "Abort failed to stop command ring: %d\n", ret);
+ spin_lock_irqsave(&xhci->lock, flags);
xhci_halt(xhci);
xhci_hc_died(xhci);
return ret;
@@ -532,7 +534,6 @@ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci, unsigned long flags)
* but the completion event in never sent. Wait 2 secs (arbitrary
* number) to handle those cases after negation of CMD_RING_RUNNING.
*/
- spin_unlock_irqrestore(&xhci->lock, flags);
ret = wait_for_completion_timeout(&xhci->cmd_ring_stop_completion,
msecs_to_jiffies(2000));
spin_lock_irqsave(&xhci->lock, flags);
--
2.17.1
On 10/22/25 13:00, Uttkarsh Aggarwal wrote: > Currently xhci_handshake is a polling loop that waits for change of state. > If this loop is executed while holding a spinlock with IRQs disabled, it > can block interrupts for up to 5 seconds. > > To prevent prolonged IRQ disable durations that may lead to watchdog > timeouts, release the spinlock before invoking xhci_handshake() in > xhci_abort_cmd_ring(). > > The spinlock is reacquired after the handshake to continue with controller > halt and recovery if needed. Is this a real issue? It should be extremely rare that commands need to be aborted, and even less likely that it takes five seconds to stop the command ring. Can you take logs and traces of this (if it's reproducible). I suspect there is some other issue that needs to be fixed. I agree that keeping the spin lock for up to five seconds isn't a good idea, but just unlocking before the command ring has stopped opens up new race risks. We need to ensure that queuing a new command mid ring abortion, before ring stopped, doesn't cause new issues, or that handling command completion events, including the "stop command ring" event, before polling for a stopped ring works fine. Thanks Mathias
On 10/22/2025 6:19 PM, Mathias Nyman wrote: > On 10/22/25 13:00, Uttkarsh Aggarwal wrote: >> Currently xhci_handshake is a polling loop that waits for change of >> state. >> If this loop is executed while holding a spinlock with IRQs disabled, it >> can block interrupts for up to 5 seconds. >> >> To prevent prolonged IRQ disable durations that may lead to watchdog >> timeouts, release the spinlock before invoking xhci_handshake() in >> xhci_abort_cmd_ring(). >> >> The spinlock is reacquired after the handshake to continue with >> controller >> halt and recovery if needed. > > Is this a real issue? > > It should be extremely rare that commands need to be aborted, and even > less likely that it takes five seconds to stop the command ring. > > Can you take logs and traces of this (if it's reproducible). > I suspect there is some other issue that needs to be fixed. > > I agree that keeping the spin lock for up to five seconds isn't a good > idea, but > just unlocking before the command ring has stopped opens up new race risks. > > We need to ensure that queuing a new command mid ring abortion, before > ring stopped, > doesn't cause new issues, or that handling command completion events, > including the > "stop command ring" event, before polling for a stopped ring works fine. > Hi Mathias, Yes, actually when we do usb headset connected, no audio playing, resume the device and do some operations like volume +/-, there will be xhci_handle_command_timeout. logs: It's wdog bark, we can see : xhci_handle_command_timeout kicked in since 68346, but not end till 68356 dog bark. 
[68346.109071][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_handle_command_timeout start [68346.109081][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_handle_command_timeout xhci_abort_cmd_ring start [68346.109083][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_abort_cmd_ring start [68346.109084][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_abort_cmd_ring xhci_trb_virt_to_dma [68346.109087][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_abort_cmd_ring xhci_handshake_check_state start …. [68356.215787][ C1] [2025:09:04 23:19:10](1)msm_watchdog 17600000.qcom,wdt: Causing a QCOM Apps Watchdog bite! [68356.215790][ C1] [2025:09:04 23:19:10](1)[RB/E]rb_sreason_str_set: sreason_str set wdog_bite Call Trace: xhci_configure_endpoint hang in below call stack. Stack: __switch_to+0x1a8 schedule+0xacc schedule_timeout+0x4c wait_for_common+0xc0 wait_for_completion+0x18 xhci_configure_endpoint+0xe4 xhci_check_bandwidth+0x1a8 usb_hcd_alloc_bandwidth+0x37c usb_disable_device_endpoints+0xb0 usb_disable_device+0x30 usb_disconnect+0xd0 hub_event+0xc4c process_scheduled_works+0x208 worker_thread+0x1c0 kthread+0xfc ret_from_fork+0x10 Finally crash on dog bite. sdei_wdg_bite_cb[qcom_sdei]+0x110 sdei_event_handler+0x3c do_sdei_event+0x88 __sdei_handler+0x48 __sdei_asm_handler+0x110 __const_udelay+0x148 xhci_handshake+0x58 xhci_handle_command_timeout+0x248 process_scheduled_works+0x208 worker_thread+0x1c0 kthread+0xfc ret_from_fork+0x10 Based on the logs and crash stacks I have collected so far, including the watchdog bark and the prolonged execution of xhci_handle_command_timeout, it seems likely that there's an underlying issue causing the ring to hang during xhci_configure_endpoint. Thanks, Uttkarsh,
On 11/6/25 11:57, Uttkarsh Aggarwal wrote: > > > On 10/22/2025 6:19 PM, Mathias Nyman wrote: >> On 10/22/25 13:00, Uttkarsh Aggarwal wrote: >>> Currently xhci_handshake is a polling loop that waits for change of state. >>> If this loop is executed while holding a spinlock with IRQs disabled, it >>> can block interrupts for up to 5 seconds. >>> >>> To prevent prolonged IRQ disable durations that may lead to watchdog >>> timeouts, release the spinlock before invoking xhci_handshake() in >>> xhci_abort_cmd_ring(). >>> >>> The spinlock is reacquired after the handshake to continue with controller >>> halt and recovery if needed. >> >> Is this a real issue? >> >> It should be extremely rare that commands need to be aborted, and even >> less likely that it takes five seconds to stop the command ring. >> >> Can you take logs and traces of this (if it's reproducible). >> I suspect there is some other issue that needs to be fixed. >> >> I agree that keeping the spin lock for up to five seconds isn't a good idea, but >> just unlocking before the command ring has stopped opens up new race risks. >> >> We need to ensure that queuing a new command mid ring abortion, before ring stopped, >> doesn't cause new issues, or that handling command completion events, including the >> "stop command ring" event, before polling for a stopped ring works fine. >> > > Hi Mathias, > > Yes, actually when we do usb headset connected, no audio playing, resume the device and do some operations like volume +/-, there will be xhci_handle_command_timeout. > > logs: > > It’s wdog bark, we can see : xhci_handle_command_timeout kicked in since 68346, but not end till 68356 dog bark. 
> > [68346.109071][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_handle_command_timeout start > [68346.109081][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_handle_command_timeout xhci_abort_cmd_ring start > [68346.109083][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_abort_cmd_ring start > [68346.109084][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_abort_cmd_ring xhci_trb_virt_to_dma > [68346.109087][T25562] [2025:09:04 23:19:02](2)[25562:kworker/2:3]lwz debug: xhci_abort_cmd_ring xhci_handshake_check_state start > …. > [68356.215787][ C1] [2025:09:04 23:19:10](1)msm_watchdog 17600000.qcom,wdt: Causing a QCOM Apps Watchdog bite! > [68356.215790][ C1] [2025:09:04 23:19:10](1)[RB/E]rb_sreason_str_set: sreason_str set wdog_bite > > Call Trace: > xhci_configure_endpoint hang in below call stack. > > Stack: > __switch_to+0x1a8 > schedule+0xacc > schedule_timeout+0x4c > wait_for_common+0xc0 > wait_for_completion+0x18 > xhci_configure_endpoint+0xe4 > xhci_check_bandwidth+0x1a8 > usb_hcd_alloc_bandwidth+0x37c > usb_disable_device_endpoints+0xb0 > usb_disable_device+0x30 > usb_disconnect+0xd0 > hub_event+0xc4c > process_scheduled_works+0x208 > worker_thread+0x1c0 > kthread+0xfc > ret_from_fork+0x10 > > Finally crash on dog bite. > > sdei_wdg_bite_cb[qcom_sdei]+0x110 > sdei_event_handler+0x3c > do_sdei_event+0x88 > __sdei_handler+0x48 > __sdei_asm_handler+0x110 > __const_udelay+0x148 > xhci_handshake+0x58 > xhci_handle_command_timeout+0x248 > process_scheduled_works+0x208 > worker_thread+0x1c0 > kthread+0xfc > ret_from_fork+0x10 > > Based on the logs and crash stacks I have collected so far, including the watchdog bark and the prolonged execution of xhci_handle_command_timeout, > it seems likely that there's an underlying issue causing the ring to hang during xhci_configure_endpoint. 
> > Thanks, > Uttkarsh, > Thanks Uttkarsh Best guess is that this is somehow related to xHC suspend/resume as xHC and command processing are stopped during suspend. xhci_suspend() also wipes the command ring clean of any pending command TRBs without any warning. A suspended host could also return 0xffffffff on register read, which in turn would cause xhci_handshake(reg, CMD_RING_RUNNING, 0, 5sec) to run for full five seconds unsuccessfully waiting for the "command ring running" bit to clear. Could you add xhci dynamic debug and tracing and reproduce this? It could show how we end up in this situation. mount -t debugfs none /sys/kernel/debug echo 'module xhci_hcd =p' >/sys/kernel/debug/dynamic_debug/control echo 'module usbcore =p' >/sys/kernel/debug/dynamic_debug/control echo 81920 > /sys/kernel/debug/tracing/buffer_size_kb echo 1 > /sys/kernel/debug/tracing/events/xhci-hcd/enable echo 1 > /sys/kernel/debug/tracing/tracing_on < Reproduce issue > Send output of dmesg Send content of /sys/kernel/debug/tracing/trace I'll also write a testpatch with extra debugging related to command ring Thanks Mathias
> > A suspended host could also return 0xffffffff on register read, which in turn would cause > xhci_handshake(reg, CMD_RING_RUNNING, 0, 5sec) to run for full five seconds unsuccessfully > waiting for the "command ring running" bit to clear. > Forgot that xhci_handshake() does check for 0xffffffff, and will return -ENODEV, so this is not part of the issue -Mathias
print messages with CMDDBG prefix if command ring is cleared
with pending commands, or fails to start due to sw state
TESTPATCH
Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
---
drivers/usb/host/xhci-ring.c | 5 ++++-
drivers/usb/host/xhci.c | 16 ++++++++++++++++
2 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index b9e676d37a33..f8029d837428 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -421,8 +421,11 @@ static unsigned int xhci_ring_expansion_needed(struct xhci_hcd *xhci, struct xhc
/* Ring the host controller doorbell after placing a command on the ring */
void xhci_ring_cmd_db(struct xhci_hcd *xhci)
{
- if (!(xhci->cmd_ring_state & CMD_RING_STATE_RUNNING))
+ if (!(xhci->cmd_ring_state & CMD_RING_STATE_RUNNING)) {
+ xhci_err(xhci, "%pS Failed to start cmd ring, sw cmd_ring_state %d\n",
+ __builtin_return_address(0), xhci->cmd_ring_state);
return;
+ }
xhci_dbg(xhci, "// Ding dong!\n");
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index d3063f796130..fe157813c136 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -504,6 +504,14 @@ static void xhci_set_cmd_ring_deq(struct xhci_hcd *xhci)
dma_addr_t deq_dma;
u64 crcr;
+ if (!list_empty(&xhci->cmd_list))
+ xhci_err(xhci, "%pS CMDDBG set cmd ring deq pending commands\n",
+ __builtin_return_address(0));
+ if (xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue)
+ xhci_err(xhci,"%pS CMDDBG set cmd ring deq: %p and enq %p mismatch\n",
+ __builtin_return_address(0),
+ xhci->cmd_ring->dequeue, xhci->cmd_ring->enqueue);
+
deq_dma = xhci_trb_virt_to_dma(xhci->cmd_ring->deq_seg, xhci->cmd_ring->dequeue);
deq_dma &= CMD_RING_PTR_MASK;
@@ -871,6 +879,14 @@ static void xhci_clear_command_ring(struct xhci_hcd *xhci)
struct xhci_segment *seg;
ring = xhci->cmd_ring;
+
+ if (!list_empty(&xhci->cmd_list))
+ xhci_err(xhci, "CMDDBG Clearing command ring with pending commands\n");
+
+ if (ring->dequeue != ring->enqueue)
+ xhci_err(xhci,"CMDDBG clear cmd ring deq: %p and enq %p mismatch\n",
+ ring->dequeue, ring->enqueue);
+
xhci_for_each_ring_seg(ring->first_seg, seg) {
/* erase all TRBs before the link */
memset(seg->trbs, 0, sizeof(union xhci_trb) * (TRBS_PER_SEGMENT - 1));
--
2.43.0
© 2016 - 2026 Red Hat, Inc.