drivers/media/rc/imon.c | 69 +++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 27 deletions(-)
syzbot is reporting that imon has three problems which result in hung tasks
due to forever holding device lock.
First problem is that when usb_rx_callback_intf0() once got -EPROTO error
after ictx->dev_present_intf0 became true, usb_rx_callback_intf0()
resubmits urb after printk(), and resubmitted urb causes
usb_rx_callback_intf0() to again get -EPROTO error. This results in
printk() flooding (RCU stalls).
Commit 92f461517d22 ("media: ir_toy: do not resubmit broken urb") changed
ir_toy module not to resubmit when irtoy_in_callback() got -EPROTO error.
We should do similar thing for imon.
Basically, I think that imon should refrain from resubmitting urb when
callback function got an error. But since I don't know which error codes
should retry resubmitting urb, this patch handles only union of error codes
chosen from modules in drivers/media/rc/ directory which handles -EPROTO
error (i.e. ir_toy, mceusb and igorplugusb).
We need to decide whether to call usb_unlink_urb() when we got -EPROTO
error. ir_toy and mceusb call usb_unlink_urb() but igorplugusb does not
due to commit 5e4029056263 ("media: igorplugusb: remove superfluous
usb_unlink_urb()"). This patch calls usb_unlink_urb() because description
of usb_unlink_urb() suggests that it is OK to call.
Second problem is that when usb_rx_callback_intf0() once got -EPROTO error
before ictx->dev_present_intf0 becomes true, usb_rx_callback_intf0() always
resubmits urb due to commit 8791d63af0cf ("[media] imon: don't wedge
hardware after early callbacks"). If some errors should stop resubmitting
urb regardless of whether configuring the hardware has completed or not,
what that commit is doing is wrong. The ictx->dev_present_intf0 test was
introduced by commit 6f6b90c9231a ("[media] imon: don't parse scancodes
until intf configured"), but that commit did not call usb_unlink_urb()
when usb_rx_callback_intf0() got an error. Move the ictx->dev_present_intf0
test to immediately before imon_incoming_packet() so that we can call
usb_unlink_urb() as needed, or the first problem explained above happens
without printk() flooding (i.e. hung task).
Third problem is that when usb_rx_callback_intf0() is not called for some
reason (e.g. flaky hardware; the reproducer for this problem sometimes
prevents usb_rx_callback_intf0() from being called),
wait_for_completion_interruptible() in send_packet() never returns (i.e.
hung task). As a workaround for such situation, change send_packet() to
wait for completion with 10 seconds of timeout.
Also, move mutex_trylock() in imon_ir_change_protocol() to the beginning,
for memcpy() which modifies ictx->usb_tx_buf should be protected by
ictx->lock.
Also, verify at the beginning of send_packet() that ictx->lock is held
in case send_packet() is by error called from imon_ir_change_protocol()
when mutex_trylock() failed due to concurrent requests.
Link: https://syzkaller.appspot.com/bug?extid=592e2ab8775dbe0bf09a
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
#syz test
drivers/media/rc/imon.c | 69 +++++++++++++++++++++++++----------------
1 file changed, 42 insertions(+), 27 deletions(-)
diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c
index f5221b018808..3469a401a572 100644
--- a/drivers/media/rc/imon.c
+++ b/drivers/media/rc/imon.c
@@ -598,6 +598,8 @@ static int send_packet(struct imon_context *ictx)
int retval = 0;
struct usb_ctrlrequest *control_req = NULL;
+ lockdep_assert_held(&ictx->lock);
+
/* Check if we need to use control or interrupt urb */
if (!ictx->tx_control) {
pipe = usb_sndintpipe(ictx->usbdev_intf0,
@@ -645,12 +647,15 @@ static int send_packet(struct imon_context *ictx)
smp_rmb(); /* ensure later readers know we're not busy */
pr_err_ratelimited("error submitting urb(%d)\n", retval);
} else {
- /* Wait for transmission to complete (or abort) */
- retval = wait_for_completion_interruptible(
- &ictx->tx.finished);
- if (retval) {
+ /* Wait for transmission to complete (or abort or timeout) */
+ retval = wait_for_completion_interruptible_timeout(&ictx->tx.finished, 10 * HZ);
+ if (retval <= 0) {
usb_kill_urb(ictx->tx_urb);
pr_err_ratelimited("task interrupted\n");
+ if (retval < 0)
+ ictx->tx.status = retval;
+ else
+ ictx->tx.status = -ETIMEDOUT;
}
ictx->tx.busy = false;
@@ -1121,7 +1126,7 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto)
int retval;
struct imon_context *ictx = rc->priv;
struct device *dev = ictx->dev;
- bool unlock = false;
+ const bool unlock = mutex_trylock(&ictx->lock);
unsigned char ir_proto_packet[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86 };
@@ -1148,8 +1153,6 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto)
memcpy(ictx->usb_tx_buf, &ir_proto_packet, sizeof(ir_proto_packet));
- unlock = mutex_trylock(&ictx->lock);
-
retval = send_packet(ictx);
if (retval)
goto out;
@@ -1745,14 +1748,6 @@ static void usb_rx_callback_intf0(struct urb *urb)
if (!ictx)
return;
- /*
- * if we get a callback before we're done configuring the hardware, we
- * can't yet process the data, as there's nowhere to send it, but we
- * still need to submit a new rx URB to avoid wedging the hardware
- */
- if (!ictx->dev_present_intf0)
- goto out;
-
switch (urb->status) {
case -ENOENT: /* usbcore unlink successful! */
return;
@@ -1761,16 +1756,30 @@ static void usb_rx_callback_intf0(struct urb *urb)
break;
case 0:
- imon_incoming_packet(ictx, urb, intfnum);
+ /*
+ * if we get a callback before we're done configuring the hardware, we
+ * can't yet process the data, as there's nowhere to send it, but we
+ * still need to submit a new rx URB to avoid wedging the hardware
+ */
+ if (ictx->dev_present_intf0)
+ imon_incoming_packet(ictx, urb, intfnum);
break;
+ case -ECONNRESET:
+ case -EILSEQ:
+ case -EPROTO:
+ case -EPIPE:
+ dev_warn(ictx->dev, "imon %s: status(%d)\n",
+ __func__, urb->status);
+ usb_unlink_urb(urb);
+ return;
+
default:
dev_warn(ictx->dev, "imon %s: status(%d): ignored\n",
__func__, urb->status);
break;
}
-out:
usb_submit_urb(ictx->rx_urb_intf0, GFP_ATOMIC);
}
@@ -1786,14 +1795,6 @@ static void usb_rx_callback_intf1(struct urb *urb)
if (!ictx)
return;
- /*
- * if we get a callback before we're done configuring the hardware, we
- * can't yet process the data, as there's nowhere to send it, but we
- * still need to submit a new rx URB to avoid wedging the hardware
- */
- if (!ictx->dev_present_intf1)
- goto out;
-
switch (urb->status) {
case -ENOENT: /* usbcore unlink successful! */
return;
@@ -1802,16 +1803,30 @@ static void usb_rx_callback_intf1(struct urb *urb)
break;
case 0:
- imon_incoming_packet(ictx, urb, intfnum);
+ /*
+ * if we get a callback before we're done configuring the hardware, we
+ * can't yet process the data, as there's nowhere to send it, but we
+ * still need to submit a new rx URB to avoid wedging the hardware
+ */
+ if (ictx->dev_present_intf1)
+ imon_incoming_packet(ictx, urb, intfnum);
break;
+ case -ECONNRESET:
+ case -EILSEQ:
+ case -EPROTO:
+ case -EPIPE:
+ dev_warn(ictx->dev, "imon %s: status(%d)\n",
+ __func__, urb->status);
+ usb_unlink_urb(urb);
+ return;
+
default:
dev_warn(ictx->dev, "imon %s: status(%d): ignored\n",
__func__, urb->status);
break;
}
-out:
usb_submit_urb(ictx->rx_urb_intf1, GFP_ATOMIC);
}
--
2.50.1
Hello, syzbot has tested the proposed patch and the reproducer did not trigger any issue: Reported-by: syzbot+592e2ab8775dbe0bf09a@syzkaller.appspotmail.com Tested-by: syzbot+592e2ab8775dbe0bf09a@syzkaller.appspotmail.com Tested on: commit: 3f31a806 Merge tag 'mm-hotfixes-stable-2025-07-11-16-1.. git tree: upstream console output: https://syzkaller.appspot.com/x/log.txt?x=16a150f0580000 kernel config: https://syzkaller.appspot.com/x/.config?x=f481202e4ff2d138 dashboard link: https://syzkaller.appspot.com/bug?extid=592e2ab8775dbe0bf09a compiler: Debian clang version 20.1.7 (++20250616065708+6146a88f6049-1~exp1~20250616065826.132), Debian LLD 20.1.7 patch: https://syzkaller.appspot.com/x/patch.diff?x=15fa07d4580000 Note: testing is done by a robot and is best-effort only.
[loop Alan in] On Sun, 13 Jul 2025 16:50:08 +0900 Tetsuo Handa wrote: > syzbot is reporting that imon has three problems which result in hung tasks > due to forever holding device lock. > > First problem is that when usb_rx_callback_intf0() once got -EPROTO error > after ictx->dev_present_intf0 became true, usb_rx_callback_intf0() > resubmits urb after printk(), and resubmitted urb causes > usb_rx_callback_intf0() to again get -EPROTO error. This results in > printk() flooding (RCU stalls). > > Commit 92f461517d22 ("media: ir_toy: do not resubmit broken urb") changed > ir_toy module not to resubmit when irtoy_in_callback() got -EPROTO error. > We should do similar thing for imon. > > Basically, I think that imon should refrain from resubmitting urb when > callback function got an error. But since I don't know which error codes > should retry resubmitting urb, this patch handles only union of error codes > chosen from modules in drivers/media/rc/ directory which handles -EPROTO > error (i.e. ir_toy, mceusb and igorplugusb). > > We need to decide whether to call usb_unlink_urb() when we got -EPROTO > error. ir_toy and mceusb call usb_unlink_urb() but igorplugusb does not > due to commit 5e4029056263 ("media: igorplugusb: remove superfluous > usb_unlink_urb()"). This patch calls usb_unlink_urb() because description > of usb_unlink_urb() suggests that it is OK to call. > > Second problem is that when usb_rx_callback_intf0() once got -EPROTO error > before ictx->dev_present_intf0 becomes true, usb_rx_callback_intf0() always > resubmits urb due to commit 8791d63af0cf ("[media] imon: don't wedge > hardware after early callbacks"). If some errors should stop resubmitting > urb regardless of whether configuring the hardware has completed or not, > what that commit is doing is wrong. The ictx->dev_present_intf0 test was > introduced by commit 6f6b90c9231a ("[media] imon: don't parse scancodes > until intf configured"), but that commit did not call usb_unlink_urb() > when usb_rx_callback_intf0() got an error. Move the ictx->dev_present_intf0 > test to immediately before imon_incoming_packet() so that we can call > usb_unlink_urb() as needed, or the first problem explained above happens > without printk() flooding (i.e. hung task). > > Third problem is that when usb_rx_callback_intf0() is not called for some > reason (e.g. flaky hardware; the reproducer for this problem sometimes > prevents usb_rx_callback_intf0() from being called), > wait_for_completion_interruptible() in send_packet() never returns (i.e. > hung task). As a workaround for such situation, change send_packet() to > wait for completion with 10 seconds of timeout. > > Also, move mutex_trylock() in imon_ir_change_protocol() to the beginning, > for memcpy() which modifies ictx->usb_tx_buf should be protected by > ictx->lock. > > Also, verify at the beginning of send_packet() that ictx->lock is held > in case send_packet() is by error called from imon_ir_change_protocol() > when mutex_trylock() failed due to concurrent requests. > > Link: https://syzkaller.appspot.com/bug?extid=592e2ab8775dbe0bf09a > Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> > --- > #syz test > > drivers/media/rc/imon.c | 69 +++++++++++++++++++++++++---------------- > 1 file changed, 42 insertions(+), 27 deletions(-) > > diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c > index f5221b018808..3469a401a572 100644 > --- a/drivers/media/rc/imon.c > +++ b/drivers/media/rc/imon.c > @@ -598,6 +598,8 @@ static int send_packet(struct imon_context *ictx) > int retval = 0; > struct usb_ctrlrequest *control_req = NULL; > > + lockdep_assert_held(&ictx->lock); > + > /* Check if we need to use control or interrupt urb */ > if (!ictx->tx_control) { > pipe = usb_sndintpipe(ictx->usbdev_intf0, > @@ -645,12 +647,15 @@ static int send_packet(struct imon_context *ictx) > smp_rmb(); /* ensure later readers know we're not busy */ > pr_err_ratelimited("error submitting urb(%d)\n", retval); > } else { > - /* Wait for transmission to complete (or abort) */ > - retval = wait_for_completion_interruptible( > - &ictx->tx.finished); > - if (retval) { > + /* Wait for transmission to complete (or abort or timeout) */ > + retval = wait_for_completion_interruptible_timeout(&ictx->tx.finished, 10 * HZ); Is the underlying hardware is not stable if the submitted urb failed to complete within 10 seconds for example? In the product environment is it making sense to ask for change to BOM, bill of material, if 10s timedout could be reliably reproduced twice a month? > + if (retval <= 0) { > usb_kill_urb(ictx->tx_urb); > pr_err_ratelimited("task interrupted\n"); > + if (retval < 0) > + ictx->tx.status = retval; > + else > + ictx->tx.status = -ETIMEDOUT; > } > > ictx->tx.busy = false; > @@ -1121,7 +1126,7 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto) > int retval; > struct imon_context *ictx = rc->priv; > struct device *dev = ictx->dev; > - bool unlock = false; > + const bool unlock = mutex_trylock(&ictx->lock); > unsigned char ir_proto_packet[] = { > 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86 }; > > @@ -1148,8 +1153,6 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto) > > memcpy(ictx->usb_tx_buf, &ir_proto_packet, sizeof(ir_proto_packet)); > > - unlock = mutex_trylock(&ictx->lock); > - > retval = send_packet(ictx); > if (retval) > goto out; > @@ -1745,14 +1748,6 @@ static void usb_rx_callback_intf0(struct urb *urb) > if (!ictx) > return; > > - /* > - * if we get a callback before we're done configuring the hardware, we > - * can't yet process the data, as there's nowhere to send it, but we > - * still need to submit a new rx URB to avoid wedging the hardware > - */ > - if (!ictx->dev_present_intf0) > - goto out; > - > switch (urb->status) { > case -ENOENT: /* usbcore unlink successful! */ > return; > @@ -1761,16 +1756,30 @@ static void usb_rx_callback_intf0(struct urb *urb) > break; > > case 0: > - imon_incoming_packet(ictx, urb, intfnum); > + /* > + * if we get a callback before we're done configuring the hardware, we > + * can't yet process the data, as there's nowhere to send it, but we > + * still need to submit a new rx URB to avoid wedging the hardware > + */ > + if (ictx->dev_present_intf0) > + imon_incoming_packet(ictx, urb, intfnum); > break; > > + case -ECONNRESET: > + case -EILSEQ: > + case -EPROTO: > + case -EPIPE: > + dev_warn(ictx->dev, "imon %s: status(%d)\n", > + __func__, urb->status); > + usb_unlink_urb(urb); > + return; > + > default: > dev_warn(ictx->dev, "imon %s: status(%d): ignored\n", > __func__, urb->status); > break; > } > > -out: > usb_submit_urb(ictx->rx_urb_intf0, GFP_ATOMIC); > } > > @@ -1786,14 +1795,6 @@ static void usb_rx_callback_intf1(struct urb *urb) > if (!ictx) > return; > > - /* > - * if we get a callback before we're done configuring the hardware, we > - * can't yet process the data, as there's nowhere to send it, but we > - * still need to submit a new rx URB to avoid wedging the hardware > - */ > - if (!ictx->dev_present_intf1) > - goto out; > - > switch (urb->status) { > case -ENOENT: /* usbcore unlink successful! */ > return; > @@ -1802,16 +1803,30 @@ static void usb_rx_callback_intf1(struct urb *urb) > break; > > case 0: > - imon_incoming_packet(ictx, urb, intfnum); > + /* > + * if we get a callback before we're done configuring the hardware, we > + * can't yet process the data, as there's nowhere to send it, but we > + * still need to submit a new rx URB to avoid wedging the hardware > + */ > + if (ictx->dev_present_intf1) > + imon_incoming_packet(ictx, urb, intfnum); > break; > > + case -ECONNRESET: > + case -EILSEQ: > + case -EPROTO: > + case -EPIPE: > + dev_warn(ictx->dev, "imon %s: status(%d)\n", > + __func__, urb->status); > + usb_unlink_urb(urb); > + return; > + > default: > dev_warn(ictx->dev, "imon %s: status(%d): ignored\n", > __func__, urb->status); > break; > } > > -out: > usb_submit_urb(ictx->rx_urb_intf1, GFP_ATOMIC); > } > > -- > 2.50.1 > > >
On Sun, Jul 13, 2025 at 04:11:47PM +0800, Hillf Danton wrote: > [loop Alan in] I assume you're interested in the question of when to avoid resubmitting URBs. > On Sun, 13 Jul 2025 16:50:08 +0900 Tetsuo Handa wrote: > > syzbot is reporting that imon has three problems which result in hung tasks > > due to forever holding device lock. > > > > First problem is that when usb_rx_callback_intf0() once got -EPROTO error > > after ictx->dev_present_intf0 became true, usb_rx_callback_intf0() > > resubmits urb after printk(), and resubmitted urb causes > > usb_rx_callback_intf0() to again get -EPROTO error. This results in > > printk() flooding (RCU stalls). > > > > Commit 92f461517d22 ("media: ir_toy: do not resubmit broken urb") changed > > ir_toy module not to resubmit when irtoy_in_callback() got -EPROTO error. > > We should do similar thing for imon. > > > > Basically, I think that imon should refrain from resubmitting urb when > > callback function got an error. But since I don't know which error codes > > should retry resubmitting urb, this patch handles only union of error codes > > chosen from modules in drivers/media/rc/ directory which handles -EPROTO > > error (i.e. ir_toy, mceusb and igorplugusb). In theory it's okay to resubmit _if_ the driver has a robust error-recovery scheme (such as giving up after some fixed limit on the number of errors or after some fixed time has elapsed, perhaps with a time delay to prevent a flood of errors). Most drivers don't bother to do this; they simply give up right away. This makes them more vulnerable to short-term noise interference during USB transfers, but in reality such interference is quite rare. There's nothing really wrong with giving up right away. As to which error codes drivers should pay attention to... In most cases they only look at -EPROTO. According to Documentation/driver-api/usb/error-codes.rst, -EILSEQ and -ETIME are also possible errors when a device has been unplugged, so it wouldn't hurt to check for them too. But most host controller drivers don't bother to issue them; -EPROTO is by far the most common error code following an unplug. > > We need to decide whether to call usb_unlink_urb() when we got -EPROTO > > error. ir_toy and mceusb call usb_unlink_urb() but igorplugusb does not > > due to commit 5e4029056263 ("media: igorplugusb: remove superfluous > > usb_unlink_urb()"). This patch calls usb_unlink_urb() because description > > of usb_unlink_urb() suggests that it is OK to call. If the error occurred because the device was unplugged then unlinking the outstanding URBs isn't necessary; the USB core will unlink them for you after the device's parent hub reports that the unplug took place. > > Second problem is that when usb_rx_callback_intf0() once got -EPROTO error > > before ictx->dev_present_intf0 becomes true, usb_rx_callback_intf0() always > > resubmits urb due to commit 8791d63af0cf ("[media] imon: don't wedge > > hardware after early callbacks"). If some errors should stop resubmitting > > urb regardless of whether configuring the hardware has completed or not, > > what that commit is doing is wrong. The ictx->dev_present_intf0 test was > > introduced by commit 6f6b90c9231a ("[media] imon: don't parse scancodes > > until intf configured"), but that commit did not call usb_unlink_urb() > > when usb_rx_callback_intf0() got an error. Move the ictx->dev_present_intf0 > > test to immediately before imon_incoming_packet() so that we can call > > usb_unlink_urb() as needed, or the first problem explained above happens > > without printk() flooding (i.e. hung task). It seems odd for a driver to set up normal communications with a device before the device has been configured, but of course that decision is up to the creators and maintainers of the driver. Alan Stern
Hi Alan, On Sun, Jul 13, 2025 at 11:21:24AM -0400, Alan Stern wrote: > On Sun, Jul 13, 2025 at 04:11:47PM +0800, Hillf Danton wrote: > > [loop Alan in] > > I assume you're interested in the question of when to avoid resubmitting > URBs. > > > On Sun, 13 Jul 2025 16:50:08 +0900 Tetsuo Handa wrote: > > > syzbot is reporting that imon has three problems which result in hung tasks > > > due to forever holding device lock. > > > > > > First problem is that when usb_rx_callback_intf0() once got -EPROTO error > > > after ictx->dev_present_intf0 became true, usb_rx_callback_intf0() > > > resubmits urb after printk(), and resubmitted urb causes > > > usb_rx_callback_intf0() to again get -EPROTO error. This results in > > > printk() flooding (RCU stalls). > > > > > > Commit 92f461517d22 ("media: ir_toy: do not resubmit broken urb") changed > > > ir_toy module not to resubmit when irtoy_in_callback() got -EPROTO error. > > > We should do similar thing for imon. > > > > > > Basically, I think that imon should refrain from resubmitting urb when > > > callback function got an error. But since I don't know which error codes > > > should retry resubmitting urb, this patch handles only union of error codes > > > chosen from modules in drivers/media/rc/ directory which handles -EPROTO > > > error (i.e. ir_toy, mceusb and igorplugusb). > > In theory it's okay to resubmit _if_ the driver has a robust > error-recovery scheme (such as giving up after some fixed limit on the > number of errors or after some fixed time has elapsed, perhaps with a > time delay to prevent a flood of errors). Most drivers don't bother to > do this; they simply give up right away. This makes them more > vulnerable to short-term noise interference during USB transfers, but in > reality such interference is quite rare. There's nothing really wrong > with giving up right away. > > As to which error codes drivers should pay attention to... In most > cases they only look at -EPROTO. According to > Documentation/driver-api/usb/error-codes.rst, -EILSEQ and -ETIME are > also possible errors when a device has been unplugged, so it wouldn't > hurt to check for them too. But most host controller drivers don't > bother to issue them; -EPROTO is by far the most common error code > following an unplug. Thank you for explaining that, very helpful. Would it be useful to have this in the USB completion handler documentation? > > > We need to decide whether to call usb_unlink_urb() when we got -EPROTO > > > error. ir_toy and mceusb call usb_unlink_urb() but igorplugusb does not > > > due to commit 5e4029056263 ("media: igorplugusb: remove superfluous > > > usb_unlink_urb()"). This patch calls usb_unlink_urb() because description > > > of usb_unlink_urb() suggests that it is OK to call. > > If the error occurred because the device was unplugged then unlinking > the outstanding URBs isn't necessary; the USB core will unlink them for > you after the device's parent hub reports that the unplug took place. Are you saying there is a case when usb_unlink_urb() is necessary: if the device was not unplugged and -EPROTO is reported? > > > Second problem is that when usb_rx_callback_intf0() once got -EPROTO error > > > before ictx->dev_present_intf0 becomes true, usb_rx_callback_intf0() always > > > resubmits urb due to commit 8791d63af0cf ("[media] imon: don't wedge > > > hardware after early callbacks"). If some errors should stop resubmitting > > > urb regardless of whether configuring the hardware has completed or not, > > > what that commit is doing is wrong. The ictx->dev_present_intf0 test was > > > introduced by commit 6f6b90c9231a ("[media] imon: don't parse scancodes > > > until intf configured"), but that commit did not call usb_unlink_urb() > > > when usb_rx_callback_intf0() got an error. Move the ictx->dev_present_intf0 > > > test to immediately before imon_incoming_packet() so that we can call > > > usb_unlink_urb() as needed, or the first problem explained above happens > > > without printk() flooding (i.e. hung task). > > It seems odd for a driver to set up normal communications with a device > before the device has been configured, but of course that decision is up > to the creators and maintainers of the driver. The usb device has two interfaces, and we need both of them before we can do anything useful. Badly designed hardware. I think that is why this driver code is so awkward. Sean
On Tue, Jul 15, 2025 at 09:19:18PM +0100, Sean Young wrote: > Hi Alan, > > On Sun, Jul 13, 2025 at 11:21:24AM -0400, Alan Stern wrote: > > On Sun, Jul 13, 2025 at 04:11:47PM +0800, Hillf Danton wrote: > > > [loop Alan in] > > > > I assume you're interested in the question of when to avoid resubmitting > > URBs. > > In theory it's okay to resubmit _if_ the driver has a robust > > error-recovery scheme (such as giving up after some fixed limit on the > > number of errors or after some fixed time has elapsed, perhaps with a > > time delay to prevent a flood of errors). Most drivers don't bother to > > do this; they simply give up right away. This makes them more > > vulnerable to short-term noise interference during USB transfers, but in > > reality such interference is quite rare. There's nothing really wrong > > with giving up right away. > > > > As to which error codes drivers should pay attention to... In most > > cases they only look at -EPROTO. According to > > Documentation/driver-api/usb/error-codes.rst, -EILSEQ and -ETIME are > > also possible errors when a device has been unplugged, so it wouldn't > > hurt to check for them too. But most host controller drivers don't > > bother to issue them; -EPROTO is by far the most common error code > > following an unplug. > > Thank you for explaining that, very helpful. Would it be useful to have > this in the USB completion handler documentation? I don't know what USB completion handler documentation you're talking about. Is it something in the Documentation/ directory? If it is then it should already include or refer to error-codes.rst. Or perhaps you're talking about the kerneldoc for this particular completion handler? There's no reason for that to include all the material that's already in error-codes.rst. But you might put a comment in the code at the point where -EPROTO errors are handled, explaining that they generally indicate that the device has been unplugged. > > If the error occurred because the device was unplugged then unlinking > > the outstanding URBs isn't necessary; the USB core will unlink them for > > you after the device's parent hub reports that the unplug took place. > > Are you saying there is a case when usb_unlink_urb() is necessary: if the > device was not unplugged and -EPROTO is reported? That depends on the driver. If it wants to cancel other outstanding URBs merely because one URB got a -EPROTO error but the device wasn't unplugged, then it has to call usb_unlink_urb() or something equivalent. Otherwise it will have to wait for those other URBs to complete in the usual way. (Of course, when the -EPROTO error code shows up in the completion handler, the driver doesn't know yet whether the device has been unplugged...) > > > > Second problem is that when usb_rx_callback_intf0() once got -EPROTO error > > > > before ictx->dev_present_intf0 becomes true, usb_rx_callback_intf0() always > > > > resubmits urb due to commit 8791d63af0cf ("[media] imon: don't wedge > > > > hardware after early callbacks"). If some errors should stop resubmitting > > > > urb regardless of whether configuring the hardware has completed or not, > > > > what that commit is doing is wrong. The ictx->dev_present_intf0 test was > > > > introduced by commit 6f6b90c9231a ("[media] imon: don't parse scancodes > > > > until intf configured"), but that commit did not call usb_unlink_urb() > > > > when usb_rx_callback_intf0() got an error. Move the ictx->dev_present_intf0 > > > > test to immediately before imon_incoming_packet() so that we can call > > > > usb_unlink_urb() as needed, or the first problem explained above happens > > > > without printk() flooding (i.e. hung task). > > > > It seems odd for a driver to set up normal communications with a device > > before the device has been configured, but of course that decision is up > > to the creators and maintainers of the driver. > > The usb device has two interfaces, and we need both of them before we can > do anything useful. Badly designed hardware. > > I think that is why this driver code is so awkward. That's what usb_driver_claim_interface() is for. IIRC, the cdc-acm driver uses it in exactly this way. Alan Stern
On Tue, Jul 15, 2025 at 09:30:02PM -0400, Alan Stern wrote: > On Tue, Jul 15, 2025 at 09:19:18PM +0100, Sean Young wrote: > > Hi Alan, > > > > On Sun, Jul 13, 2025 at 11:21:24AM -0400, Alan Stern wrote: > > > On Sun, Jul 13, 2025 at 04:11:47PM +0800, Hillf Danton wrote: > > > > [loop Alan in] > > > > > > I assume you're interested in the question of when to avoid resubmitting > > > URBs. > > > > In theory it's okay to resubmit _if_ the driver has a robust > > > error-recovery scheme (such as giving up after some fixed limit on the > > > number of errors or after some fixed time has elapsed, perhaps with a > > > time delay to prevent a flood of errors). Most drivers don't bother to > > > do this; they simply give up right away. This makes them more > > > vulnerable to short-term noise interference during USB transfers, but in > > > reality such interference is quite rare. There's nothing really wrong > > > with giving up right away. > > > > > > As to which error codes drivers should pay attention to... In most > > > cases they only look at -EPROTO. According to > > > Documentation/driver-api/usb/error-codes.rst, -EILSEQ and -ETIME are > > > also possible errors when a device has been unplugged, so it wouldn't > > > hurt to check for them too. But most host controller drivers don't > > > bother to issue them; -EPROTO is by far the most common error code > > > following an unplug. > > > > Thank you for explaining that, very helpful. Would it be useful to have > > this in the USB completion handler documentation? > > I don't know what USB completion handler documentation you're talking > about. Is it something in the Documentation/ directory? If it is then > it should already include or refer to error-codes.rst. I can't see anything in error-codes.rst or URB.rst about the possibility of retrying after -EPROTO errors or how the callback should respond if it wants to give up. USB drivers seem to do all manner of different things. > > I think that is why this driver code is so awkward. > > That's what usb_driver_claim_interface() is for. IIRC, the cdc-acm > driver uses it in exactly this way. Very interesting, we should look at re-writing this driver. Note this function is not documented in Documentation/driver-api/usb/ Thank you for your help Sean
On Wed, Jul 16, 2025 at 10:38:23AM +0100, Sean Young wrote: > On Tue, Jul 15, 2025 at 09:30:02PM -0400, Alan Stern wrote: > > On Tue, Jul 15, 2025 at 09:19:18PM +0100, Sean Young wrote: > > > Hi Alan, > > > > > > On Sun, Jul 13, 2025 at 11:21:24AM -0400, Alan Stern wrote: > > > > On Sun, Jul 13, 2025 at 04:11:47PM +0800, Hillf Danton wrote: > > > > > [loop Alan in] > > > > > > > > I assume you're interested in the question of when to avoid resubmitting > > > > URBs. > > > > > > In theory it's okay to resubmit _if_ the driver has a robust > > > > error-recovery scheme (such as giving up after some fixed limit on the > > > > number of errors or after some fixed time has elapsed, perhaps with a > > > > time delay to prevent a flood of errors). Most drivers don't bother to > > > > do this; they simply give up right away. This makes them more > > > > vulnerable to short-term noise interference during USB transfers, but in > > > > reality such interference is quite rare. There's nothing really wrong > > > > with giving up right away. > > > > > > > > As to which error codes drivers should pay attention to... In most > > > > cases they only look at -EPROTO. According to > > > > Documentation/driver-api/usb/error-codes.rst, -EILSEQ and -ETIME are > > > > also possible errors when a device has been unplugged, so it wouldn't > > > > hurt to check for them too. But most host controller drivers don't > > > > bother to issue them; -EPROTO is by far the most common error code > > > > following an unplug. > > > > > > Thank you for explaining that, very helpful. Would it be useful to have > > > this in the USB completion handler documentation? > > > > I don't know what USB completion handler documentation you're talking > > about. Is it something in the Documentation/ directory? If it is then > > it should already include or refer to error-codes.rst. > > I can't see anything in error-codes.rst or URB.rst about the possibility > of retrying after -EPROTO errors or how the callback should respond if > it wants to give up. USB drivers seem to do all manner of different things. error-codes.rst is meant merely to explain the meanings of the different error codes. How to deal with them when they occur is not in its scope. Drivers behave in different ways because they were written by different people and serve different purposes. For example, data loss in a mass-storage driver would be a lot more serious than data loss in a mouse driver, so of course the two drivers don't use the same amount of care when recovering from errors. As for how a callback should be behave if it wants to give up, that depends on how the driver is designed. There is no one single answer appropriate for all drivers. In the simplest case, where the driver always keeps one URB in flight and resubmits the URB whenever it completes, giving up is easy -- just don't resubmit the URB! This will immediately end all the communication with the device. > > > I think that is why this driver code is so awkward. > > > > That's what usb_driver_claim_interface() is for. IIRC, the cdc-acm > > driver uses it in exactly this way. > > Very interesting, we should look at re-writing this driver. Note this > function is not documented in Documentation/driver-api/usb/ The documentation files are quite old and were never complete. Nowadays we rely much more heavily on the kerneldoc in the source code itself. > Thank you for your help You're welcome. Alan Stern
On 2025/07/16 18:38, Sean Young wrote: > On Tue, Jul 15, 2025 at 09:30:02PM -0400, Alan Stern wrote: >> On Tue, Jul 15, 2025 at 09:19:18PM +0100, Sean Young wrote: >>> Hi Alan, >>> >>> On Sun, Jul 13, 2025 at 11:21:24AM -0400, Alan Stern wrote: >>>> On Sun, Jul 13, 2025 at 04:11:47PM +0800, Hillf Danton wrote: >>>>> [loop Alan in] >>>> >>>> I assume you're interested in the question of when to avoid resubmitting >>>> URBs. I think that what Hillf wanted to know (and I wanted maintainers of this driver to respond) is whether timeout of 10 seconds is reasonable - /* Wait for transmission to complete (or abort) */ - retval = wait_for_completion_interruptible( - &ictx->tx.finished); - if (retval) { + /* Wait for transmission to complete (or abort or timeout) */ + retval = wait_for_completion_interruptible_timeout(&ictx->tx.finished, 10 * HZ); because the reproducer for this problem sometimes prevents usb_rx_callback_intf0() from being called. Unless we somehow handle such situation, the hung task reports won't go away. >>> I think that is why this driver code is so awkward. >> >> That's what usb_driver_claim_interface() is for. IIRC, the cdc-acm >> driver uses it in exactly this way. > > Very interesting, we should look at re-writing this driver. Note this > function is not documented in Documentation/driver-api/usb/ OK. Then, what do you want to do for this syzbot report? If you want to apply this patch, I'll send an updated patch with Alan's comment. If you want to directly rewrite this module, this patch will be discarded.
On Wed, Jul 16, 2025 at 07:09:51PM +0900, Tetsuo Handa wrote: > On 2025/07/16 18:38, Sean Young wrote: > > On Tue, Jul 15, 2025 at 09:30:02PM -0400, Alan Stern wrote: > >> On Tue, Jul 15, 2025 at 09:19:18PM +0100, Sean Young wrote: > >>> I think that is why this driver code is so awkward. > >> > >> That's what usb_driver_claim_interface() is for. IIRC, the cdc-acm > >> driver uses it in exactly this way. > > > > Very interesting, we should look at re-writing this driver. Note this > > function is not documented in Documentation/driver-api/usb/ > > OK. Then, what do you want to do for this syzbot report? > If you want to apply this patch, I'll send an updated patch with Alan's comment. > If you want to directly rewrite this module, this patch will be discarded. Let's apply your updated patch. It looks good. I've started looking at re-writing the driver to use usb_driver_claim_interface(), but I don't know when that will be ready (or if it'll work). Thanks, Sean
syzbot is reporting that imon has three problems which result in
hung tasks due to forever holding device lock [1].
First problem is that when usb_rx_callback_intf0() once got -EPROTO error
after ictx->dev_present_intf0 became true, usb_rx_callback_intf0()
resubmits urb after printk(), and resubmitted urb causes
usb_rx_callback_intf0() to again get -EPROTO error. This results in
printk() flooding (RCU stalls).
Alan Stern commented [2] that
In theory it's okay to resubmit _if_ the driver has a robust
error-recovery scheme (such as giving up after some fixed limit on the
number of errors or after some fixed time has elapsed, perhaps with a
time delay to prevent a flood of errors). Most drivers don't bother to
do this; they simply give up right away. This makes them more
vulnerable to short-term noise interference during USB transfers, but in
reality such interference is quite rare. There's nothing really wrong
with giving up right away.
but imon has a poor error-recovery scheme which just retries forever;
this behavior should be fixed.
Since I'm not sure whether it is safe for imon users to give up upon any
error code, this patch takes care of only union of error codes chosen from
modules in drivers/media/rc/ directory which handle -EPROTO error (i.e.
ir_toy, mceusb and igorplugusb).
Second problem is that when usb_rx_callback_intf0() once got -EPROTO error
before ictx->dev_present_intf0 becomes true, usb_rx_callback_intf0() always
resubmits urb due to commit 8791d63af0cf ("[media] imon: don't wedge
hardware after early callbacks"). The ictx->dev_present_intf0 test was
introduced by commit 6f6b90c9231a ("[media] imon: don't parse scancodes
until intf configured"), but that commit did not call usb_unlink_urb()
when usb_rx_callback_intf0() got an error. Move the ictx->dev_present_intf0
test to immediately before imon_incoming_packet() so that we can call
usb_unlink_urb() as needed, or the first problem explained above happens
without printk() flooding (i.e. hung task).
Third problem is that when usb_rx_callback_intf0() is not called for some
reason (e.g. flaky hardware; the reproducer for this problem sometimes
prevents usb_rx_callback_intf0() from being called),
wait_for_completion_interruptible() in send_packet() never returns (i.e.
hung task). As a workaround for such situation, change send_packet() to
wait for completion with timeout of 10 seconds.
Also, move mutex_trylock() in imon_ir_change_protocol() to the beginning,
for memcpy() which modifies ictx->usb_tx_buf should be protected by
ictx->lock.
Also, verify at the beginning of send_packet() that ictx->lock is held
in case send_packet() is by error called from imon_ir_change_protocol()
when mutex_trylock() failed due to concurrent requests.
Link: https://syzkaller.appspot.com/bug?extid=592e2ab8775dbe0bf09a [1]
Link: https://lkml.kernel.org/r/d6da6709-d799-4be3-a695-850bddd6eb24@rowland.harvard.edu [2]
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
Changes in v2:
Updated patch description.
drivers/media/rc/imon.c | 69 +++++++++++++++++++++++++----------------
1 file changed, 42 insertions(+), 27 deletions(-)
diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c
index f5221b018808..3469a401a572 100644
--- a/drivers/media/rc/imon.c
+++ b/drivers/media/rc/imon.c
@@ -598,6 +598,8 @@ static int send_packet(struct imon_context *ictx)
int retval = 0;
struct usb_ctrlrequest *control_req = NULL;
+ lockdep_assert_held(&ictx->lock);
+
/* Check if we need to use control or interrupt urb */
if (!ictx->tx_control) {
pipe = usb_sndintpipe(ictx->usbdev_intf0,
@@ -645,12 +647,15 @@ static int send_packet(struct imon_context *ictx)
smp_rmb(); /* ensure later readers know we're not busy */
pr_err_ratelimited("error submitting urb(%d)\n", retval);
} else {
- /* Wait for transmission to complete (or abort) */
- retval = wait_for_completion_interruptible(
- &ictx->tx.finished);
- if (retval) {
+ /* Wait for transmission to complete (or abort or timeout) */
+ retval = wait_for_completion_interruptible_timeout(&ictx->tx.finished, 10 * HZ);
+ if (retval <= 0) {
usb_kill_urb(ictx->tx_urb);
pr_err_ratelimited("task interrupted\n");
+ if (retval < 0)
+ ictx->tx.status = retval;
+ else
+ ictx->tx.status = -ETIMEDOUT;
}
ictx->tx.busy = false;
@@ -1121,7 +1126,7 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto)
int retval;
struct imon_context *ictx = rc->priv;
struct device *dev = ictx->dev;
- bool unlock = false;
+ const bool unlock = mutex_trylock(&ictx->lock);
unsigned char ir_proto_packet[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86 };
@@ -1148,8 +1153,6 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto)
memcpy(ictx->usb_tx_buf, &ir_proto_packet, sizeof(ir_proto_packet));
- unlock = mutex_trylock(&ictx->lock);
-
retval = send_packet(ictx);
if (retval)
goto out;
@@ -1745,14 +1748,6 @@ static void usb_rx_callback_intf0(struct urb *urb)
if (!ictx)
return;
- /*
- * if we get a callback before we're done configuring the hardware, we
- * can't yet process the data, as there's nowhere to send it, but we
- * still need to submit a new rx URB to avoid wedging the hardware
- */
- if (!ictx->dev_present_intf0)
- goto out;
-
switch (urb->status) {
case -ENOENT: /* usbcore unlink successful! */
return;
@@ -1761,16 +1756,30 @@ static void usb_rx_callback_intf0(struct urb *urb)
break;
case 0:
- imon_incoming_packet(ictx, urb, intfnum);
+ /*
+ * if we get a callback before we're done configuring the hardware, we
+ * can't yet process the data, as there's nowhere to send it, but we
+ * still need to submit a new rx URB to avoid wedging the hardware
+ */
+ if (ictx->dev_present_intf0)
+ imon_incoming_packet(ictx, urb, intfnum);
break;
+ case -ECONNRESET:
+ case -EILSEQ:
+ case -EPROTO:
+ case -EPIPE:
+ dev_warn(ictx->dev, "imon %s: status(%d)\n",
+ __func__, urb->status);
+ usb_unlink_urb(urb);
+ return;
+
default:
dev_warn(ictx->dev, "imon %s: status(%d): ignored\n",
__func__, urb->status);
break;
}
-out:
usb_submit_urb(ictx->rx_urb_intf0, GFP_ATOMIC);
}
@@ -1786,14 +1795,6 @@ static void usb_rx_callback_intf1(struct urb *urb)
if (!ictx)
return;
- /*
- * if we get a callback before we're done configuring the hardware, we
- * can't yet process the data, as there's nowhere to send it, but we
- * still need to submit a new rx URB to avoid wedging the hardware
- */
- if (!ictx->dev_present_intf1)
- goto out;
-
switch (urb->status) {
case -ENOENT: /* usbcore unlink successful! */
return;
@@ -1802,16 +1803,30 @@ static void usb_rx_callback_intf1(struct urb *urb)
break;
case 0:
- imon_incoming_packet(ictx, urb, intfnum);
+ /*
+ * if we get a callback before we're done configuring the hardware, we
+ * can't yet process the data, as there's nowhere to send it, but we
+ * still need to submit a new rx URB to avoid wedging the hardware
+ */
+ if (ictx->dev_present_intf1)
+ imon_incoming_packet(ictx, urb, intfnum);
break;
+ case -ECONNRESET:
+ case -EILSEQ:
+ case -EPROTO:
+ case -EPIPE:
+ dev_warn(ictx->dev, "imon %s: status(%d)\n",
+ __func__, urb->status);
+ usb_unlink_urb(urb);
+ return;
+
default:
dev_warn(ictx->dev, "imon %s: status(%d): ignored\n",
__func__, urb->status);
break;
}
-out:
usb_submit_urb(ictx->rx_urb_intf1, GFP_ATOMIC);
}
--
2.50.1
On Wed, Jul 16, 2025 at 11:07:17PM +0900, Tetsuo Handa wrote: > syzbot is reporting that imon has three problems which result in > hung tasks due to forever holding device lock [1]. > > First problem is that when usb_rx_callback_intf0() once got -EPROTO error > after ictx->dev_present_intf0 became true, usb_rx_callback_intf0() > resubmits urb after printk(), and resubmitted urb causes > usb_rx_callback_intf0() to again get -EPROTO error. This results in > printk() flooding (RCU stalls). > > Alan Stern commented [2] that > > In theory it's okay to resubmit _if_ the driver has a robust > error-recovery scheme (such as giving up after some fixed limit on the > number of errors or after some fixed time has elapsed, perhaps with a > time delay to prevent a flood of errors). Most drivers don't bother to > do this; they simply give up right away. This makes them more > vulnerable to short-term noise interference during USB transfers, but in > reality such interference is quite rare. There's nothing really wrong > with giving up right away. > > but imon has a poor error-recovery scheme which just retries forever; > this behavior should be fixed. > > Since I'm not sure whether it is safe for imon users to give up upon any > error code, this patch takes care of only union of error codes chosen from > modules in drivers/media/rc/ directory which handle -EPROTO error (i.e. > ir_toy, mceusb and igorplugusb). > > Second problem is that when usb_rx_callback_intf0() once got -EPROTO error > before ictx->dev_present_intf0 becomes true, usb_rx_callback_intf0() always > resubmits urb due to commit 8791d63af0cf ("[media] imon: don't wedge > hardware after early callbacks"). The ictx->dev_present_intf0 test was > introduced by commit 6f6b90c9231a ("[media] imon: don't parse scancodes > until intf configured"), but that commit did not call usb_unlink_urb() > when usb_rx_callback_intf0() got an error. Move the ictx->dev_present_intf0 > test to immediately before imon_incoming_packet() so that we can call > usb_unlink_urb() as needed, or the first problem explained above happens > without printk() flooding (i.e. hung task). > > Third problem is that when usb_rx_callback_intf0() is not called for some > reason (e.g. flaky hardware; the reproducer for this problem sometimes > prevents usb_rx_callback_intf0() from being called), > wait_for_completion_interruptible() in send_packet() never returns (i.e. > hung task). As a workaround for such situation, change send_packet() to > wait for completion with timeout of 10 seconds. > > Also, move mutex_trylock() in imon_ir_change_protocol() to the beginning, > for memcpy() which modifies ictx->usb_tx_buf should be protected by > ictx->lock. > > Also, verify at the beginning of send_packet() that ictx->lock is held > in case send_packet() is by error called from imon_ir_change_protocol() > when mutex_trylock() failed due to concurrent requests. > > Link: https://syzkaller.appspot.com/bug?extid=592e2ab8775dbe0bf09a [1] > Link: https://lkml.kernel.org/r/d6da6709-d799-4be3-a695-850bddd6eb24@rowland.harvard.edu [2] > Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> > --- Making multiple significant changes like these in a single patch is generally a bad idea. It would be better to split this up into two or three patches, each doing one thing. > Changes in v2: > Updated patch description. > > drivers/media/rc/imon.c | 69 +++++++++++++++++++++++++---------------- > 1 file changed, 42 insertions(+), 27 deletions(-) > > diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c > index f5221b018808..3469a401a572 100644 > --- a/drivers/media/rc/imon.c > +++ b/drivers/media/rc/imon.c > @@ -1761,16 +1756,30 @@ static void usb_rx_callback_intf0(struct urb *urb) > break; > > case 0: > - imon_incoming_packet(ictx, urb, intfnum); > + /* > + * if we get a callback before we're done configuring the hardware, we > + * can't yet process the data, as there's nowhere to send it, but we > + * still need to submit a new rx URB to avoid wedging the hardware > + */ > + if (ictx->dev_present_intf0) > + imon_incoming_packet(ictx, urb, intfnum); > break; > > + case -ECONNRESET: > + case -EILSEQ: > + case -EPROTO: > + case -EPIPE: > + dev_warn(ictx->dev, "imon %s: status(%d)\n", > + __func__, urb->status); > + usb_unlink_urb(urb); The URB you're unlinking here is the one that just completed, right? Which means it's already unlinked, so this call is unnecessary. > @@ -1802,16 +1803,30 @@ static void usb_rx_callback_intf1(struct urb *urb) > break; > > case 0: > - imon_incoming_packet(ictx, urb, intfnum); > + /* > + * if we get a callback before we're done configuring the hardware, we > + * can't yet process the data, as there's nowhere to send it, but we > + * still need to submit a new rx URB to avoid wedging the hardware > + */ > + if (ictx->dev_present_intf1) > + imon_incoming_packet(ictx, urb, intfnum); > break; > > + case -ECONNRESET: > + case -EILSEQ: > + case -EPROTO: > + case -EPIPE: > + dev_warn(ictx->dev, "imon %s: status(%d)\n", > + __func__, urb->status); > + usb_unlink_urb(urb); > + return; Same here. Alan Stern
syzbot is reporting that imon has three problems which result in
hung tasks due to forever holding device lock [1].
First problem is that when usb_rx_callback_intf0() once got -EPROTO error
after ictx->dev_present_intf0 became true, usb_rx_callback_intf0()
resubmits urb after printk(), and resubmitted urb causes
usb_rx_callback_intf0() to again get -EPROTO error. This results in
printk() flooding (RCU stalls).
Alan Stern commented [2] that
In theory it's okay to resubmit _if_ the driver has a robust
error-recovery scheme (such as giving up after some fixed limit on the
number of errors or after some fixed time has elapsed, perhaps with a
time delay to prevent a flood of errors). Most drivers don't bother to
do this; they simply give up right away. This makes them more
vulnerable to short-term noise interference during USB transfers, but in
reality such interference is quite rare. There's nothing really wrong
with giving up right away.
but imon has a poor error-recovery scheme which just retries forever;
this behavior should be fixed.
Since I'm not sure whether it is safe for imon users to give up upon any
error code, this patch takes care of only union of error codes chosen from
modules in drivers/media/rc/ directory which handle -EPROTO error (i.e.
ir_toy, mceusb and igorplugusb).
Second problem is that when usb_rx_callback_intf0() once got -EPROTO error
before ictx->dev_present_intf0 becomes true, usb_rx_callback_intf0() always
resubmits urb due to commit 8791d63af0cf ("[media] imon: don't wedge
hardware after early callbacks"). Move the ictx->dev_present_intf0 test
introduced by commit 6f6b90c9231a ("[media] imon: don't parse scancodes
until intf configured") to immediately before imon_incoming_packet(), or
the first problem explained above happens without printk() flooding (i.e.
hung task).
Third problem is that when usb_rx_callback_intf0() is not called for some
reason (e.g. flaky hardware; the reproducer for this problem sometimes
prevents usb_rx_callback_intf0() from being called),
wait_for_completion_interruptible() in send_packet() never returns (i.e.
hung task). As a workaround for such situation, change send_packet() to
wait for completion with timeout of 10 seconds.
Link: https://syzkaller.appspot.com/bug?extid=592e2ab8775dbe0bf09a [1]
Link: https://lkml.kernel.org/r/d6da6709-d799-4be3-a695-850bddd6eb24@rowland.harvard.edu [2]
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
Changes in v3:
Dropped usb_unlink_urb() call from usb_rx_callback_intf{0,1}().
Dropped ictx->lock change for sending as a separate patch.
Changes in v2:
Updated patch description.
drivers/media/rc/imon.c | 61 +++++++++++++++++++++++++----------------
1 file changed, 37 insertions(+), 24 deletions(-)
diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c
index f5221b018808..b914dd39c21c 100644
--- a/drivers/media/rc/imon.c
+++ b/drivers/media/rc/imon.c
@@ -645,12 +645,15 @@ static int send_packet(struct imon_context *ictx)
smp_rmb(); /* ensure later readers know we're not busy */
pr_err_ratelimited("error submitting urb(%d)\n", retval);
} else {
- /* Wait for transmission to complete (or abort) */
- retval = wait_for_completion_interruptible(
- &ictx->tx.finished);
- if (retval) {
+ /* Wait for transmission to complete (or abort or timeout) */
+ retval = wait_for_completion_interruptible_timeout(&ictx->tx.finished, 10 * HZ);
+ if (retval <= 0) {
usb_kill_urb(ictx->tx_urb);
pr_err_ratelimited("task interrupted\n");
+ if (retval < 0)
+ ictx->tx.status = retval;
+ else
+ ictx->tx.status = -ETIMEDOUT;
}
ictx->tx.busy = false;
@@ -1745,14 +1748,6 @@ static void usb_rx_callback_intf0(struct urb *urb)
if (!ictx)
return;
- /*
- * if we get a callback before we're done configuring the hardware, we
- * can't yet process the data, as there's nowhere to send it, but we
- * still need to submit a new rx URB to avoid wedging the hardware
- */
- if (!ictx->dev_present_intf0)
- goto out;
-
switch (urb->status) {
case -ENOENT: /* usbcore unlink successful! */
return;
@@ -1761,16 +1756,29 @@ static void usb_rx_callback_intf0(struct urb *urb)
break;
case 0:
- imon_incoming_packet(ictx, urb, intfnum);
+ /*
+ * if we get a callback before we're done configuring the hardware, we
+ * can't yet process the data, as there's nowhere to send it, but we
+ * still need to submit a new rx URB to avoid wedging the hardware
+ */
+ if (ictx->dev_present_intf0)
+ imon_incoming_packet(ictx, urb, intfnum);
break;
+ case -ECONNRESET:
+ case -EILSEQ:
+ case -EPROTO:
+ case -EPIPE:
+ dev_warn(ictx->dev, "imon %s: status(%d)\n",
+ __func__, urb->status);
+ return;
+
default:
dev_warn(ictx->dev, "imon %s: status(%d): ignored\n",
__func__, urb->status);
break;
}
-out:
usb_submit_urb(ictx->rx_urb_intf0, GFP_ATOMIC);
}
@@ -1786,14 +1794,6 @@ static void usb_rx_callback_intf1(struct urb *urb)
if (!ictx)
return;
- /*
- * if we get a callback before we're done configuring the hardware, we
- * can't yet process the data, as there's nowhere to send it, but we
- * still need to submit a new rx URB to avoid wedging the hardware
- */
- if (!ictx->dev_present_intf1)
- goto out;
-
switch (urb->status) {
case -ENOENT: /* usbcore unlink successful! */
return;
@@ -1802,16 +1802,29 @@ static void usb_rx_callback_intf1(struct urb *urb)
break;
case 0:
- imon_incoming_packet(ictx, urb, intfnum);
+ /*
+ * if we get a callback before we're done configuring the hardware, we
+ * can't yet process the data, as there's nowhere to send it, but we
+ * still need to submit a new rx URB to avoid wedging the hardware
+ */
+ if (ictx->dev_present_intf1)
+ imon_incoming_packet(ictx, urb, intfnum);
break;
+ case -ECONNRESET:
+ case -EILSEQ:
+ case -EPROTO:
+ case -EPIPE:
+ dev_warn(ictx->dev, "imon %s: status(%d)\n",
+ __func__, urb->status);
+ return;
+
default:
dev_warn(ictx->dev, "imon %s: status(%d): ignored\n",
__func__, urb->status);
break;
}
-out:
usb_submit_urb(ictx->rx_urb_intf1, GFP_ATOMIC);
}
--
2.50.1
On Wed, 16 Jul 2025 19:09:51 +0900 Tetsuo Handa wrote: >On 2025/07/16 18:38, Sean Young wrote: >> On Tue, Jul 15, 2025 at 09:30:02PM -0400, Alan Stern wrote: >>> On Tue, Jul 15, 2025 at 09:19:18PM +0100, Sean Young wrote: >>>> Hi Alan, >>>> >>>> On Sun, Jul 13, 2025 at 11:21:24AM -0400, Alan Stern wrote: >>>>> On Sun, Jul 13, 2025 at 04:11:47PM +0800, Hillf Danton wrote: >>>>>> [loop Alan in] >>>>> >>>>> I assume you're interested in the question of when to avoid resubmitting >>>>> URBs. > > I think that what Hillf wanted to know (and I wanted maintainers of this > driver to respond) is whether timeout of 10 seconds is reasonable > Yes. In product environments like car cockpit I have option like change to BOM if urb 10s timedout in general could be reliably reproduced twice a month for example. > - /* Wait for transmission to complete (or abort) */ > - retval = wait_for_completion_interruptible( > - &ictx->tx.finished); > - if (retval) { > + /* Wait for transmission to complete (or abort or timeout) */ > + retval = wait_for_completion_interruptible_timeout(&ictx->tx.finished, 10 * HZ); > > because the reproducer for this problem sometimes prevents > usb_rx_callback_intf0() from being called. Unless we somehow > handle such situation, the hung task reports won't go away. >
© 2016 - 2025 Red Hat, Inc.