[PATCH] mailbox: Fix NULL message support in mbox_send_message()

jassisinghbrar@gmail.com posted 1 patch 5 days, 14 hours ago
drivers/mailbox/mailbox.c   | 15 ++++++++-------
drivers/mailbox/mailbox.h   |  3 +++
drivers/mailbox/tegra-hsp.c |  2 +-
3 files changed, 12 insertions(+), 8 deletions(-)
[PATCH] mailbox: Fix NULL message support in mbox_send_message()
Posted by jassisinghbrar@gmail.com 5 days, 14 hours ago
From: Jassi Brar <jassisinghbrar@gmail.com>

The active_req field serves double duty as both the "is a TX in
flight" flag (NULL means idle) and the storage for the in-flight
message pointer. When a client sends NULL via mbox_send_message(),
active_req is set to NULL, which the framework misinterprets as
"no active request". This breaks the TX state machine in two ways:

 - tx_tick() short-circuits on (!mssg), skipping the tx_done
   callback and the tx_complete completion.
 - txdone_hrtimer() skips the channel entirely since active_req
   is NULL, so poll-based TX-done detection never fires.

Fix this by introducing a MBOX_NO_MSG sentinel value that means
"no active request," freeing NULL to be valid message data. The
sentinel is defined in the subsystem-internal mailbox.h so that
controller drivers within drivers/mailbox/ can reference it, but
it is not exposed to clients outside the subsystem.

Fifteen in-tree callers send NULL (doorbell-style IPCs on Qualcomm,
Tegra, TI, Xilinx, i.MX, SCMI, and PCC platforms). All were
audited for regression:

 - Most already work around the bug via knows_txdone=true with a
   manual mbox_client_txdone() call, making the framework's
   tracking irrelevant. These are unaffected.

 - Poll-based callers (Xilinx zynqmp/r5) are strictly better off:
   the poll timer now correctly detects NULL-active channels
   instead of silently skipping them.

 - irq-qcom-mpm.c had a pre-existing bug -- it was the only Qualcomm
   caller that omitted the knows_txdone + mbox_client_txdone()
   pattern. Fixed in a companion commit ("irqchip/qcom-mpm: Fix
   missing mailbox TX done acknowledgment").

 - No caller both sets a tx_done callback and sends NULL, nor does
   any caller combine tx_block=true with NULL sends, so the newly
   reachable callback/completion paths are never exercised.

Also update tegra-hsp's flush callback, which directly inspects
active_req to wait for the channel to drain: the old "!= NULL"
check becomes "!= MBOX_NO_MSG", otherwise flush spins until
timeout since the sentinel is non-NULL.

The only tradeoff is that 'MBOX_NO_MSG' cannot be used as a message
by clients; mbox_send_message() rejects it with -EINVAL.

Reported-by: Joonwon Kang <joonwonkang@google.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
---
 drivers/mailbox/mailbox.c   | 15 ++++++++-------
 drivers/mailbox/mailbox.h   |  3 +++
 drivers/mailbox/tegra-hsp.c |  2 +-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
index 617ba505691d..9622369cab66 100644
--- a/drivers/mailbox/mailbox.c
+++ b/drivers/mailbox/mailbox.c
@@ -52,7 +52,7 @@ static void msg_submit(struct mbox_chan *chan)
 	int err = -EBUSY;
 
 	scoped_guard(spinlock_irqsave, &chan->lock) {
-		if (!chan->msg_count || chan->active_req)
+		if (!chan->msg_count || chan->active_req != MBOX_NO_MSG)
 			break;
 
 		count = chan->msg_count;
@@ -87,13 +87,13 @@ static void tx_tick(struct mbox_chan *chan, int r)
 
 	scoped_guard(spinlock_irqsave, &chan->lock) {
 		mssg = chan->active_req;
-		chan->active_req = NULL;
+		chan->active_req = MBOX_NO_MSG;
 	}
 
 	/* Submit next message */
 	msg_submit(chan);
 
-	if (!mssg)
+	if (mssg == MBOX_NO_MSG)
 		return;
 
 	/* Notify the client */
@@ -114,7 +114,7 @@ static enum hrtimer_restart txdone_hrtimer(struct hrtimer *hrtimer)
 	for (i = 0; i < mbox->num_chans; i++) {
 		struct mbox_chan *chan = &mbox->chans[i];
 
-		if (chan->active_req && chan->cl) {
+		if (chan->active_req != MBOX_NO_MSG && chan->cl) {
 			txdone = chan->mbox->ops->last_tx_done(chan);
 			if (txdone)
 				tx_tick(chan, 0);
@@ -246,7 +246,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg)
 {
 	int t;
 
-	if (!chan || !chan->cl)
+	if (!chan || !chan->cl || mssg == MBOX_NO_MSG)
 		return -EINVAL;
 
 	t = add_to_rbuf(chan, mssg);
@@ -319,7 +319,7 @@ static int __mbox_bind_client(struct mbox_chan *chan, struct mbox_client *cl)
 	scoped_guard(spinlock_irqsave, &chan->lock) {
 		chan->msg_free = 0;
 		chan->msg_count = 0;
-		chan->active_req = NULL;
+		chan->active_req = MBOX_NO_MSG;
 		chan->cl = cl;
 		init_completion(&chan->tx_complete);
 
@@ -477,7 +477,7 @@ void mbox_free_channel(struct mbox_chan *chan)
 	/* The queued TX requests are simply aborted, no callbacks are made */
 	scoped_guard(spinlock_irqsave, &chan->lock) {
 		chan->cl = NULL;
-		chan->active_req = NULL;
+		chan->active_req = MBOX_NO_MSG;
 		if (chan->txdone_method == TXDONE_BY_ACK)
 			chan->txdone_method = TXDONE_BY_POLL;
 	}
@@ -532,6 +532,7 @@ int mbox_controller_register(struct mbox_controller *mbox)
 
 		chan->cl = NULL;
 		chan->mbox = mbox;
+		chan->active_req = MBOX_NO_MSG;
 		chan->txdone_method = txdone;
 		spin_lock_init(&chan->lock);
 	}
diff --git a/drivers/mailbox/mailbox.h b/drivers/mailbox/mailbox.h
index e1ec4efab693..c77dd6fc5b8a 100644
--- a/drivers/mailbox/mailbox.h
+++ b/drivers/mailbox/mailbox.h
@@ -5,6 +5,9 @@
 
 #include <linux/bits.h>
 
+/* Sentinel value distinguishing "no active request" from "NULL message data" */
+#define MBOX_NO_MSG	((void *)-1)
+
 #define TXDONE_BY_IRQ	BIT(0) /* controller has remote RTR irq */
 #define TXDONE_BY_POLL	BIT(1) /* controller can read status of last TX */
 #define TXDONE_BY_ACK	BIT(2) /* S/W ACK received by Client ticks the TX */
diff --git a/drivers/mailbox/tegra-hsp.c b/drivers/mailbox/tegra-hsp.c
index ed9a0bb2bcd8..7991e8dba579 100644
--- a/drivers/mailbox/tegra-hsp.c
+++ b/drivers/mailbox/tegra-hsp.c
@@ -497,7 +497,7 @@ static int tegra_hsp_mailbox_flush(struct mbox_chan *chan,
 			mbox_chan_txdone(chan, 0);
 
 			/* Wait until channel is empty */
-			if (chan->active_req != NULL)
+			if (chan->active_req != MBOX_NO_MSG)
 				continue;
 
 			return 0;
-- 
2.52.0
Re: [PATCH] mailbox: Fix NULL message support in mbox_send_message()
Posted by Joonwon Kang 2 days, 2 hours ago
> The active_req field serves double duty as both the "is a TX in
> flight" flag (NULL means idle) and the storage for the in-flight
> message pointer. When a client sends NULL via mbox_send_message(),
> active_req is set to NULL, which the framework misinterprets as
> "no active request". This breaks the TX state machine by:
> 
>  - tx_tick() short-circuits on (!mssg), skipping the tx_done
>    callback and the tx_complete completion
>  - txdone_hrtimer() skips the channel entirely since active_req
>    is NULL, so poll-based TX-done detection never fires.
> 
> Fix this by introducing a MBOX_NO_MSG sentinel value that means
> "no active request," freeing NULL to be valid message data. The
> sentinel is defined in the subsystem-internal mailbox.h so that
> controller drivers within drivers/mailbox/ can reference it, but
> it is not exposed to clients outside the subsystem.
> 
> Fifteen in-tree callers send NULL (doorbell-style IPCs on Qualcomm,
> Tegra, TI, Xilinx, i.MX, SCMI, and PCC platforms). All were
> audited for regression:
> 
>  - Most already work around the bug via knows_txdone=true with a
>    manual mbox_client_txdone() call, making the framework's
>    tracking irrelevant. These are unaffected.
> 
>  - Poll-based callers (Xilinx zynqmp/r5) are strictly better off:
>    the poll timer now correctly detects NULL-active channels
>    instead of silently skipping them.
> 
>  - irq-qcom-mpm.c was a pre-existing bug -- the only Qualcomm
>    caller that omitted the knows_txdone + mbox_client_txdone()
>    pattern. Fixed in a companion commit ("irqchip/qcom-mpm: Fix
>    missing mailbox TX done acknowledgment").
> 
>  - No caller sets both a tx_done callback and sends NULL, nor
>    combines tx_block=true with NULL sends, so the newly reachable
>    callback/completion paths are never exercised.
> 
> Also update tegra-hsp's flush callback, which directly inspects
> active_req to wait for the channel to drain: the old "!= NULL"
> check becomes "!= MBOX_NO_MSG", otherwise flush spins until
> timeout since the sentinel is non-NULL.
> 
> The only tradeoff is that 'MBOX_NO_MSG' can not be used as a message
> by clients.
> 
> Reported-by: Joonwon Kang <joonwonkang@google.com>
> Reviewed-by: Douglas Anderson <dianders@chromium.org>
> Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>

Do you have plans to backport this patch to other stable versions?
If not, I can send the backport for you to the stable versions that are in my needs.

Thanks,
Joonwon Kang
Re: [PATCH] mailbox: Fix NULL message support in mbox_send_message()
Posted by Jassi Brar 1 day, 7 hours ago
On Tue, Mar 31, 2026 at 5:08 AM Joonwon Kang <joonwonkang@google.com> wrote:
>
> > The active_req field serves double duty as both the "is a TX in
> > flight" flag (NULL means idle) and the storage for the in-flight
> > message pointer. When a client sends NULL via mbox_send_message(),
> > active_req is set to NULL, which the framework misinterprets as
> > "no active request". This breaks the TX state machine by:
> >
> >  - tx_tick() short-circuits on (!mssg), skipping the tx_done
> >    callback and the tx_complete completion
> >  - txdone_hrtimer() skips the channel entirely since active_req
> >    is NULL, so poll-based TX-done detection never fires.
> >
> > Fix this by introducing a MBOX_NO_MSG sentinel value that means
> > "no active request," freeing NULL to be valid message data. The
> > sentinel is defined in the subsystem-internal mailbox.h so that
> > controller drivers within drivers/mailbox/ can reference it, but
> > it is not exposed to clients outside the subsystem.
> >
> > Fifteen in-tree callers send NULL (doorbell-style IPCs on Qualcomm,
> > Tegra, TI, Xilinx, i.MX, SCMI, and PCC platforms). All were
> > audited for regression:
> >
> >  - Most already work around the bug via knows_txdone=true with a
> >    manual mbox_client_txdone() call, making the framework's
> >    tracking irrelevant. These are unaffected.
> >
> >  - Poll-based callers (Xilinx zynqmp/r5) are strictly better off:
> >    the poll timer now correctly detects NULL-active channels
> >    instead of silently skipping them.
> >
> >  - irq-qcom-mpm.c was a pre-existing bug -- the only Qualcomm
> >    caller that omitted the knows_txdone + mbox_client_txdone()
> >    pattern. Fixed in a companion commit ("irqchip/qcom-mpm: Fix
> >    missing mailbox TX done acknowledgment").
> >
> >  - No caller sets both a tx_done callback and sends NULL, nor
> >    combines tx_block=true with NULL sends, so the newly reachable
> >    callback/completion paths are never exercised.
> >
> > Also update tegra-hsp's flush callback, which directly inspects
> > active_req to wait for the channel to drain: the old "!= NULL"
> > check becomes "!= MBOX_NO_MSG", otherwise flush spins until
> > timeout since the sentinel is non-NULL.
> >
> > The only tradeoff is that 'MBOX_NO_MSG' can not be used as a message
> > by clients.
> >
> > Reported-by: Joonwon Kang <joonwonkang@google.com>
> > Reviewed-by: Douglas Anderson <dianders@chromium.org>
> > Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
>
> Do you have plans to backport this patch to other stable versions?
> If not, I can send the backport for you to the stable versions that are in my needs.
>
Please feel free to do so. Thanks for the help.
Re: [PATCH] mailbox: Fix NULL message support in mbox_send_message()
Posted by Jassi Brar 3 days, 20 hours ago
On Fri, Mar 27, 2026 at 5:00 PM <jassisinghbrar@gmail.com> wrote:
>
> From: Jassi Brar <jassisinghbrar@gmail.com>
>
> The active_req field serves double duty as both the "is a TX in
> flight" flag (NULL means idle) and the storage for the in-flight
> message pointer. When a client sends NULL via mbox_send_message(),
> active_req is set to NULL, which the framework misinterprets as
> "no active request". This breaks the TX state machine by:
>
>  - tx_tick() short-circuits on (!mssg), skipping the tx_done
>    callback and the tx_complete completion
>  - txdone_hrtimer() skips the channel entirely since active_req
>    is NULL, so poll-based TX-done detection never fires.
>
> Fix this by introducing a MBOX_NO_MSG sentinel value that means
> "no active request," freeing NULL to be valid message data. The
> sentinel is defined in the subsystem-internal mailbox.h so that
> controller drivers within drivers/mailbox/ can reference it, but
> it is not exposed to clients outside the subsystem.
>
> Fifteen in-tree callers send NULL (doorbell-style IPCs on Qualcomm,
> Tegra, TI, Xilinx, i.MX, SCMI, and PCC platforms). All were
> audited for regression:
>
>  - Most already work around the bug via knows_txdone=true with a
>    manual mbox_client_txdone() call, making the framework's
>    tracking irrelevant. These are unaffected.
>
>  - Poll-based callers (Xilinx zynqmp/r5) are strictly better off:
>    the poll timer now correctly detects NULL-active channels
>    instead of silently skipping them.
>
>  - irq-qcom-mpm.c was a pre-existing bug -- the only Qualcomm
>    caller that omitted the knows_txdone + mbox_client_txdone()
>    pattern. Fixed in a companion commit ("irqchip/qcom-mpm: Fix
>    missing mailbox TX done acknowledgment").
>
>  - No caller sets both a tx_done callback and sends NULL, nor
>    combines tx_block=true with NULL sends, so the newly reachable
>    callback/completion paths are never exercised.
>
> Also update tegra-hsp's flush callback, which directly inspects
> active_req to wait for the channel to drain: the old "!= NULL"
> check becomes "!= MBOX_NO_MSG", otherwise flush spins until
> timeout since the sentinel is non-NULL.
>
> The only tradeoff is that 'MBOX_NO_MSG' can not be used as a message
> by clients.
>
> Reported-by: Joonwon Kang <joonwonkang@google.com>
> Reviewed-by: Douglas Anderson <dianders@chromium.org>
> Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
> ---
>  drivers/mailbox/mailbox.c   | 15 ++++++++-------
>  drivers/mailbox/mailbox.h   |  3 +++
>  drivers/mailbox/tegra-hsp.c |  2 +-
>  3 files changed, 12 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
> index 617ba505691d..9622369cab66 100644
> --- a/drivers/mailbox/mailbox.c
> +++ b/drivers/mailbox/mailbox.c
> @@ -52,7 +52,7 @@ static void msg_submit(struct mbox_chan *chan)
>         int err = -EBUSY;
>
>         scoped_guard(spinlock_irqsave, &chan->lock) {
> -               if (!chan->msg_count || chan->active_req)
> +               if (!chan->msg_count || chan->active_req != MBOX_NO_MSG)
>                         break;
>
>                 count = chan->msg_count;
> @@ -87,13 +87,13 @@ static void tx_tick(struct mbox_chan *chan, int r)
>
>         scoped_guard(spinlock_irqsave, &chan->lock) {
>                 mssg = chan->active_req;
> -               chan->active_req = NULL;
> +               chan->active_req = MBOX_NO_MSG;
>         }
>
>         /* Submit next message */
>         msg_submit(chan);
>
> -       if (!mssg)
> +       if (mssg == MBOX_NO_MSG)
>                 return;
>
>         /* Notify the client */
> @@ -114,7 +114,7 @@ static enum hrtimer_restart txdone_hrtimer(struct hrtimer *hrtimer)
>         for (i = 0; i < mbox->num_chans; i++) {
>                 struct mbox_chan *chan = &mbox->chans[i];
>
> -               if (chan->active_req && chan->cl) {
> +               if (chan->active_req != MBOX_NO_MSG && chan->cl) {
>                         txdone = chan->mbox->ops->last_tx_done(chan);
>                         if (txdone)
>                                 tx_tick(chan, 0);
> @@ -246,7 +246,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg)
>  {
>         int t;
>
> -       if (!chan || !chan->cl)
> +       if (!chan || !chan->cl || mssg == MBOX_NO_MSG)
>                 return -EINVAL;
>
>         t = add_to_rbuf(chan, mssg);
> @@ -319,7 +319,7 @@ static int __mbox_bind_client(struct mbox_chan *chan, struct mbox_client *cl)
>         scoped_guard(spinlock_irqsave, &chan->lock) {
>                 chan->msg_free = 0;
>                 chan->msg_count = 0;
> -               chan->active_req = NULL;
> +               chan->active_req = MBOX_NO_MSG;
>                 chan->cl = cl;
>                 init_completion(&chan->tx_complete);
>
> @@ -477,7 +477,7 @@ void mbox_free_channel(struct mbox_chan *chan)
>         /* The queued TX requests are simply aborted, no callbacks are made */
>         scoped_guard(spinlock_irqsave, &chan->lock) {
>                 chan->cl = NULL;
> -               chan->active_req = NULL;
> +               chan->active_req = MBOX_NO_MSG;
>                 if (chan->txdone_method == TXDONE_BY_ACK)
>                         chan->txdone_method = TXDONE_BY_POLL;
>         }
> @@ -532,6 +532,7 @@ int mbox_controller_register(struct mbox_controller *mbox)
>
>                 chan->cl = NULL;
>                 chan->mbox = mbox;
> +               chan->active_req = MBOX_NO_MSG;
>                 chan->txdone_method = txdone;
>                 spin_lock_init(&chan->lock);
>         }
> diff --git a/drivers/mailbox/mailbox.h b/drivers/mailbox/mailbox.h
> index e1ec4efab693..c77dd6fc5b8a 100644
> --- a/drivers/mailbox/mailbox.h
> +++ b/drivers/mailbox/mailbox.h
> @@ -5,6 +5,9 @@
>
>  #include <linux/bits.h>
>
> +/* Sentinel value distinguishing "no active request" from "NULL message data" */
> +#define MBOX_NO_MSG    ((void *)-1)
> +
>  #define TXDONE_BY_IRQ  BIT(0) /* controller has remote RTR irq */
>  #define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */
>  #define TXDONE_BY_ACK  BIT(2) /* S/W ACK received by Client ticks the TX */
> diff --git a/drivers/mailbox/tegra-hsp.c b/drivers/mailbox/tegra-hsp.c
> index ed9a0bb2bcd8..7991e8dba579 100644
> --- a/drivers/mailbox/tegra-hsp.c
> +++ b/drivers/mailbox/tegra-hsp.c
> @@ -497,7 +497,7 @@ static int tegra_hsp_mailbox_flush(struct mbox_chan *chan,
>                         mbox_chan_txdone(chan, 0);
>
>                         /* Wait until channel is empty */
> -                       if (chan->active_req != NULL)
> +                       if (chan->active_req != MBOX_NO_MSG)
>                                 continue;
>
>                         return 0;
> --
> 2.52.0
>
Applied to mailbox/for-next