drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++---------- 1 file changed, 16 insertions(+), 11 deletions(-)
The firmware sends a response and interrupts the driver before advancing
the mailbox send ring head pointer. As a result, the driver may observe
the response and attempt to send a new request before the firmware has
updated the head pointer. In this window, the send ring still appears
full, causing the driver to incorrectly fail the send operation.
This race can be triggered more easily in a multithreaded environment,
leading to unexpected and spurious "send ring full" failures.
To address this, poll the send ring head pointer for up to 100us before
returning a full-ring condition. This allows the firmware time to update
the head pointer.
Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++----------
1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c
index a60a85ce564c..469242ed8224 100644
--- a/drivers/accel/amdxdna/amdxdna_mailbox.c
+++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
@@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg)
u32 head, tail;
u32 start_addr;
u32 tmp_tail;
+ int ret;
head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
tail = mb_chann->x2i_tail;
- ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
+ ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I) - sizeof(u32);
start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
tmp_tail = tail + mb_msg->pkg_size;
- if (tail < head && tmp_tail >= head)
- goto no_space;
-
- if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
- mb_msg->pkg_size >= head))
- goto no_space;
- if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
+check_again:
+ if (tail >= head && tmp_tail > ringbuf_size) {
write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
writel(TOMBSTONE, write_addr);
/* tombstone is set. Write from the start of the ringbuf */
tail = 0;
+ tmp_tail = tail + mb_msg->pkg_size;
+ }
+
+ if (tail < head && tmp_tail >= head) {
+ ret = read_poll_timeout(mailbox_get_headptr, head,
+ tmp_tail < head || tail >= head,
+ 1, 100, false, mb_chann, CHAN_RES_X2I);
+ if (ret)
+ return ret;
+
+ if (tail >= head)
+ goto check_again;
}
write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
@@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg)
mb_msg->pkg.header.id);
return 0;
-
-no_space:
- return -ENOSPC;
}
static int
--
2.34.1
On 12/10/25 10:51 PM, Lizhi Hou wrote:
> The firmware sends a response and interrupts the driver before advancing
> the mailbox send ring head pointer.
What's the point of the interrupt then? Is this possible to improve in
future firmware or is this really a hardware issue? If it can be fixed
in firmware it would be ideal to conditionalize such behavior on
firmware version.
> As a result, the driver may observe
> the response and attempt to send a new request before the firmware has
> updated the head pointer. In this window, the send ring still appears
> full, causing the driver to incorrectly fail the send operation.
>
> This race can be triggered more easily in a multithreaded environment,
> leading to unexpected and spurious "send ring full" failures.
>
> To address this, poll the send ring head pointer for up to 100us before
> returning a full-ring condition. This allows the firmware time to update
> the head pointer.
>
> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> ---
> drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++----------
> 1 file changed, 16 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c
> index a60a85ce564c..469242ed8224 100644
> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
> @@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg)
> u32 head, tail;
> u32 start_addr;
> u32 tmp_tail;
> + int ret;
>
> head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
> tail = mb_chann->x2i_tail;
> - ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
> + ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I) - sizeof(u32);
> start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
> tmp_tail = tail + mb_msg->pkg_size;
>
> - if (tail < head && tmp_tail >= head)
> - goto no_space;
> -
> - if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
> - mb_msg->pkg_size >= head))
> - goto no_space;
>
> - if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
> +check_again:
> + if (tail >= head && tmp_tail > ringbuf_size) {
> write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
> writel(TOMBSTONE, write_addr);
>
> /* tombstone is set. Write from the start of the ringbuf */
> tail = 0;
> + tmp_tail = tail + mb_msg->pkg_size;
> + }
> +
> + if (tail < head && tmp_tail >= head) {
> + ret = read_poll_timeout(mailbox_get_headptr, head,
> + tmp_tail < head || tail >= head,
> + 1, 100, false, mb_chann, CHAN_RES_X2I);
> + if (ret)
> + return ret;
> +
> + if (tail >= head)
> + goto check_again;
> }
>
> write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
> @@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg)
> mb_msg->pkg.header.id);
>
> return 0;
> -
> -no_space:
> - return -ENOSPC;
> }
>
> static int
On 12/11/25 13:48, Mario Limonciello wrote:
> On 12/10/25 10:51 PM, Lizhi Hou wrote:
>> The firmware sends a response and interrupts the driver before advancing
>> the mailbox send ring head pointer.
>
> What's the point of the interrupt then? Is this possible to improve
> in future firmware or is this really a hardware issue? If it can be
> fixed in firmware it would be ideal to conditionalize such behavior on
> firmware version.
This has been found recently with a muti-thread stress test with
released firmware. My understanding is that this is a firmware issue. I
am not sure if it could be improved in future firmware.
I believe this driver change is more robust. It does not change the
logic but adds an polling instead of returning error. If the firmware
updates header quick, there will not be any polling. It works for both
current and future firmware.
Thanks,
Lizhi
>> As a result, the driver may observe
>> the response and attempt to send a new request before the firmware has
>> updated the head pointer. In this window, the send ring still appears
>> full, causing the driver to incorrectly fail the send operation.
>>
>> This race can be triggered more easily in a multithreaded environment,
>> leading to unexpected and spurious "send ring full" failures.
>>
>> To address this, poll the send ring head pointer for up to 100us before
>> returning a full-ring condition. This allows the firmware time to update
>> the head pointer.
>>
>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> ---
>> drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++----------
>> 1 file changed, 16 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c
>> b/drivers/accel/amdxdna/amdxdna_mailbox.c
>> index a60a85ce564c..469242ed8224 100644
>> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
>> @@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel
>> *mb_chann, struct mailbox_msg *mb_msg)
>> u32 head, tail;
>> u32 start_addr;
>> u32 tmp_tail;
>> + int ret;
>> head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
>> tail = mb_chann->x2i_tail;
>> - ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
>> + ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I)
>> - sizeof(u32);
>> start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
>> tmp_tail = tail + mb_msg->pkg_size;
>> - if (tail < head && tmp_tail >= head)
>> - goto no_space;
>> -
>> - if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
>> - mb_msg->pkg_size >= head))
>> - goto no_space;
>> - if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
>> +check_again:
>> + if (tail >= head && tmp_tail > ringbuf_size) {
>> write_addr = mb_chann->mb->res.ringbuf_base + start_addr +
>> tail;
>> writel(TOMBSTONE, write_addr);
>> /* tombstone is set. Write from the start of the ringbuf */
>> tail = 0;
>> + tmp_tail = tail + mb_msg->pkg_size;
>> + }
>> +
>> + if (tail < head && tmp_tail >= head) {
>> + ret = read_poll_timeout(mailbox_get_headptr, head,
>> + tmp_tail < head || tail >= head,
>> + 1, 100, false, mb_chann, CHAN_RES_X2I);
>> + if (ret)
>> + return ret;
>> +
>> + if (tail >= head)
>> + goto check_again;
>> }
>> write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
>> @@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel
>> *mb_chann, struct mailbox_msg *mb_msg)
>> mb_msg->pkg.header.id);
>> return 0;
>> -
>> -no_space:
>> - return -ENOSPC;
>> }
>> static int
>
On 12/11/25 10:41 PM, Lizhi Hou wrote:
>
> On 12/11/25 13:48, Mario Limonciello wrote:
>> On 12/10/25 10:51 PM, Lizhi Hou wrote:
>>> The firmware sends a response and interrupts the driver before advancing
>>> the mailbox send ring head pointer.
>>
>> What's the point of the interrupt then? Is this possible to improve
>> in future firmware or is this really a hardware issue? If it can be
>> fixed in firmware it would be ideal to conditionalize such behavior on
>> firmware version.
>
> This has been found recently with a muti-thread stress test with
> released firmware. My understanding is that this is a firmware issue. I
> am not sure if it could be improved in future firmware.
OK, thanks for explaining.
>
> I believe this driver change is more robust. It does not change the
> logic but adds an polling instead of returning error. If the firmware
> updates header quick, there will not be any polling. It works for both
> current and future firmware.
>
>
Yeah I don't think it's harmful, it's just unfortunate to have to land
code like this when it sounds like a firmware bug with interrupt
handling. If the firmware can be improved at least it will be right on
the very first read in the polling.
Reviewed-by: Mario Limonciello (AMD)
> Thanks,
>
> Lizhi
>
>>> As a result, the driver may observe
>>> the response and attempt to send a new request before the firmware has
>>> updated the head pointer. In this window, the send ring still appears
>>> full, causing the driver to incorrectly fail the send operation.
>>>
>>> This race can be triggered more easily in a multithreaded environment,
>>> leading to unexpected and spurious "send ring full" failures.
>>>
>>> To address this, poll the send ring head pointer for up to 100us before
>>> returning a full-ring condition. This allows the firmware time to update
>>> the head pointer.
>>>
>>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>> ---
>>> drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++----------
>>> 1 file changed, 16 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/
>>> amdxdna/amdxdna_mailbox.c
>>> index a60a85ce564c..469242ed8224 100644
>>> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
>>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
>>> @@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel
>>> *mb_chann, struct mailbox_msg *mb_msg)
>>> u32 head, tail;
>>> u32 start_addr;
>>> u32 tmp_tail;
>>> + int ret;
>>> head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
>>> tail = mb_chann->x2i_tail;
>>> - ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
>>> + ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I)
>>> - sizeof(u32);
>>> start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
>>> tmp_tail = tail + mb_msg->pkg_size;
>>> - if (tail < head && tmp_tail >= head)
>>> - goto no_space;
>>> -
>>> - if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
>>> - mb_msg->pkg_size >= head))
>>> - goto no_space;
>>> - if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
>>> +check_again:
>>> + if (tail >= head && tmp_tail > ringbuf_size) {
>>> write_addr = mb_chann->mb->res.ringbuf_base + start_addr +
>>> tail;
>>> writel(TOMBSTONE, write_addr);
>>> /* tombstone is set. Write from the start of the ringbuf */
>>> tail = 0;
>>> + tmp_tail = tail + mb_msg->pkg_size;
>>> + }
>>> +
>>> + if (tail < head && tmp_tail >= head) {
>>> + ret = read_poll_timeout(mailbox_get_headptr, head,
>>> + tmp_tail < head || tail >= head,
>>> + 1, 100, false, mb_chann, CHAN_RES_X2I);
>>> + if (ret)
>>> + return ret;
>>> +
>>> + if (tail >= head)
>>> + goto check_again;
>>> }
>>> write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
>>> @@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel
>>> *mb_chann, struct mailbox_msg *mb_msg)
>>> mb_msg->pkg.header.id);
>>> return 0;
>>> -
>>> -no_space:
>>> - return -ENOSPC;
>>> }
>>> static int
>>
Applied to drm-misc-next.
On 12/12/25 08:43, Mario Limonciello wrote:
> On 12/11/25 10:41 PM, Lizhi Hou wrote:
>>
>> On 12/11/25 13:48, Mario Limonciello wrote:
>>> On 12/10/25 10:51 PM, Lizhi Hou wrote:
>>>> The firmware sends a response and interrupts the driver before
>>>> advancing
>>>> the mailbox send ring head pointer.
>>>
>>> What's the point of the interrupt then? Is this possible to improve
>>> in future firmware or is this really a hardware issue? If it can be
>>> fixed in firmware it would be ideal to conditionalize such behavior
>>> on firmware version.
>>
>> This has been found recently with a muti-thread stress test with
>> released firmware. My understanding is that this is a firmware issue.
>> I am not sure if it could be improved in future firmware.
>
> OK, thanks for explaining.
>
>>
>> I believe this driver change is more robust. It does not change the
>> logic but adds an polling instead of returning error. If the firmware
>> updates header quick, there will not be any polling. It works for
>> both current and future firmware.
>>
>>
>
> Yeah I don't think it's harmful, it's just unfortunate to have to land
> code like this when it sounds like a firmware bug with interrupt
> handling. If the firmware can be improved at least it will be right
> on the very first read in the polling.
>
> Reviewed-by: Mario Limonciello (AMD)
>
>> Thanks,
>>
>> Lizhi
>>
>>>> As a result, the driver may observe
>>>> the response and attempt to send a new request before the firmware has
>>>> updated the head pointer. In this window, the send ring still appears
>>>> full, causing the driver to incorrectly fail the send operation.
>>>>
>>>> This race can be triggered more easily in a multithreaded environment,
>>>> leading to unexpected and spurious "send ring full" failures.
>>>>
>>>> To address this, poll the send ring head pointer for up to 100us
>>>> before
>>>> returning a full-ring condition. This allows the firmware time to
>>>> update
>>>> the head pointer.
>>>>
>>>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>>> ---
>>>> drivers/accel/amdxdna/amdxdna_mailbox.c | 27
>>>> +++++++++++++++----------
>>>> 1 file changed, 16 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c
>>>> b/drivers/accel/ amdxdna/amdxdna_mailbox.c
>>>> index a60a85ce564c..469242ed8224 100644
>>>> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
>>>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
>>>> @@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel
>>>> *mb_chann, struct mailbox_msg *mb_msg)
>>>> u32 head, tail;
>>>> u32 start_addr;
>>>> u32 tmp_tail;
>>>> + int ret;
>>>> head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
>>>> tail = mb_chann->x2i_tail;
>>>> - ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
>>>> + ringbuf_size = mailbox_get_ringbuf_size(mb_chann,
>>>> CHAN_RES_X2I) - sizeof(u32);
>>>> start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
>>>> tmp_tail = tail + mb_msg->pkg_size;
>>>> - if (tail < head && tmp_tail >= head)
>>>> - goto no_space;
>>>> -
>>>> - if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
>>>> - mb_msg->pkg_size >= head))
>>>> - goto no_space;
>>>> - if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
>>>> +check_again:
>>>> + if (tail >= head && tmp_tail > ringbuf_size) {
>>>> write_addr = mb_chann->mb->res.ringbuf_base + start_addr
>>>> + tail;
>>>> writel(TOMBSTONE, write_addr);
>>>> /* tombstone is set. Write from the start of the
>>>> ringbuf */
>>>> tail = 0;
>>>> + tmp_tail = tail + mb_msg->pkg_size;
>>>> + }
>>>> +
>>>> + if (tail < head && tmp_tail >= head) {
>>>> + ret = read_poll_timeout(mailbox_get_headptr, head,
>>>> + tmp_tail < head || tail >= head,
>>>> + 1, 100, false, mb_chann, CHAN_RES_X2I);
>>>> + if (ret)
>>>> + return ret;
>>>> +
>>>> + if (tail >= head)
>>>> + goto check_again;
>>>> }
>>>> write_addr = mb_chann->mb->res.ringbuf_base + start_addr +
>>>> tail;
>>>> @@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel
>>>> *mb_chann, struct mailbox_msg *mb_msg)
>>>> mb_msg->pkg.header.id);
>>>> return 0;
>>>> -
>>>> -no_space:
>>>> - return -ENOSPC;
>>>> }
>>>> static int
>>>
>
© 2016 - 2025 Red Hat, Inc.