accel/amdxdna: Fix race where send ring appears full due to delayed head update

[PATCH V1] accel/amdxdna: Fix race where send ring appears full due to delayed head update

Posted by Lizhi Hou 1 month, 4 weeks ago

The firmware sends a response and interrupts the driver before advancing
the mailbox send ring head pointer. As a result, the driver may observe
the response and attempt to send a new request before the firmware has
updated the head pointer. In this window, the send ring still appears
full, causing the driver to incorrectly fail the send operation.

This race can be triggered more easily in a multithreaded environment,
leading to unexpected and spurious "send ring full" failures.

To address this, poll the send ring head pointer for up to 100us before
returning a full-ring condition. This allows the firmware time to update
the head pointer.

Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
 drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c
index a60a85ce564c..469242ed8224 100644
--- a/drivers/accel/amdxdna/amdxdna_mailbox.c
+++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
@@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg)
 	u32 head, tail;
 	u32 start_addr;
 	u32 tmp_tail;
+	int ret;
 
 	head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
 	tail = mb_chann->x2i_tail;
-	ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
+	ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I) - sizeof(u32);
 	start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
 	tmp_tail = tail + mb_msg->pkg_size;
 
-	if (tail < head && tmp_tail >= head)
-		goto no_space;
-
-	if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
-			     mb_msg->pkg_size >= head))
-		goto no_space;
 
-	if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
+check_again:
+	if (tail >= head && tmp_tail > ringbuf_size) {
 		write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
 		writel(TOMBSTONE, write_addr);
 
 		/* tombstone is set. Write from the start of the ringbuf */
 		tail = 0;
+		tmp_tail = tail + mb_msg->pkg_size;
+	}
+
+	if (tail < head && tmp_tail >= head) {
+		ret = read_poll_timeout(mailbox_get_headptr, head,
+					tmp_tail < head || tail >= head,
+					1, 100, false, mb_chann, CHAN_RES_X2I);
+		if (ret)
+			return ret;
+
+		if (tail >= head)
+			goto check_again;
 	}
 
 	write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
@@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg)
 			    mb_msg->pkg.header.id);
 
 	return 0;
-
-no_space:
-	return -ENOSPC;
 }
 
 static int
-- 
2.34.1

Re: [PATCH V1] accel/amdxdna: Fix race where send ring appears full due to delayed head update

Posted by Mario Limonciello 1 month, 4 weeks ago

On 12/10/25 10:51 PM, Lizhi Hou wrote:
> The firmware sends a response and interrupts the driver before advancing
> the mailbox send ring head pointer. 

What's the point of the interrupt then?  Is this possible to improve in 
future firmware or is this really a hardware issue?  If it can be fixed 
in firmware it would be ideal to conditionalize such behavior on 
firmware version.
> As a result, the driver may observe
> the response and attempt to send a new request before the firmware has
> updated the head pointer. In this window, the send ring still appears
> full, causing the driver to incorrectly fail the send operation.
> 
> This race can be triggered more easily in a multithreaded environment,
> leading to unexpected and spurious "send ring full" failures.
> 
> To address this, poll the send ring head pointer for up to 100us before
> returning a full-ring condition. This allows the firmware time to update
> the head pointer.
> 
> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> ---
>   drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++----------
>   1 file changed, 16 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c
> index a60a85ce564c..469242ed8224 100644
> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
> @@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg)
>   	u32 head, tail;
>   	u32 start_addr;
>   	u32 tmp_tail;
> +	int ret;
>   
>   	head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
>   	tail = mb_chann->x2i_tail;
> -	ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
> +	ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I) - sizeof(u32);
>   	start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
>   	tmp_tail = tail + mb_msg->pkg_size;
>   
> -	if (tail < head && tmp_tail >= head)
> -		goto no_space;
> -
> -	if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
> -			     mb_msg->pkg_size >= head))
> -		goto no_space;
>   
> -	if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
> +check_again:
> +	if (tail >= head && tmp_tail > ringbuf_size) {
>   		write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
>   		writel(TOMBSTONE, write_addr);
>   
>   		/* tombstone is set. Write from the start of the ringbuf */
>   		tail = 0;
> +		tmp_tail = tail + mb_msg->pkg_size;
> +	}
> +
> +	if (tail < head && tmp_tail >= head) {
> +		ret = read_poll_timeout(mailbox_get_headptr, head,
> +					tmp_tail < head || tail >= head,
> +					1, 100, false, mb_chann, CHAN_RES_X2I);
> +		if (ret)
> +			return ret;
> +
> +		if (tail >= head)
> +			goto check_again;
>   	}
>   
>   	write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
> @@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg)
>   			    mb_msg->pkg.header.id);
>   
>   	return 0;
> -
> -no_space:
> -	return -ENOSPC;
>   }
>   
>   static int

Re: [PATCH V1] accel/amdxdna: Fix race where send ring appears full due to delayed head update

Posted by Lizhi Hou 1 month, 4 weeks ago

On 12/11/25 13:48, Mario Limonciello wrote:
> On 12/10/25 10:51 PM, Lizhi Hou wrote:
>> The firmware sends a response and interrupts the driver before advancing
>> the mailbox send ring head pointer. 
>
> What's the point of the interrupt then?  Is this possible to improve 
> in future firmware or is this really a hardware issue?  If it can be 
> fixed in firmware it would be ideal to conditionalize such behavior on 
> firmware version.

This has been found recently with a muti-thread stress test with 
released firmware. My understanding is that this is a firmware issue. I 
am not sure if it could be improved in future firmware.

I believe this driver change is more robust. It does not change the 
logic but adds an polling instead of returning error. If the firmware 
updates header quick, there will not be any polling. It works for both 
current and future firmware.


Thanks,

Lizhi

>> As a result, the driver may observe
>> the response and attempt to send a new request before the firmware has
>> updated the head pointer. In this window, the send ring still appears
>> full, causing the driver to incorrectly fail the send operation.
>>
>> This race can be triggered more easily in a multithreaded environment,
>> leading to unexpected and spurious "send ring full" failures.
>>
>> To address this, poll the send ring head pointer for up to 100us before
>> returning a full-ring condition. This allows the firmware time to update
>> the head pointer.
>>
>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> ---
>>   drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++----------
>>   1 file changed, 16 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c 
>> b/drivers/accel/amdxdna/amdxdna_mailbox.c
>> index a60a85ce564c..469242ed8224 100644
>> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
>> @@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel 
>> *mb_chann, struct mailbox_msg *mb_msg)
>>       u32 head, tail;
>>       u32 start_addr;
>>       u32 tmp_tail;
>> +    int ret;
>>         head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
>>       tail = mb_chann->x2i_tail;
>> -    ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
>> +    ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I) 
>> - sizeof(u32);
>>       start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
>>       tmp_tail = tail + mb_msg->pkg_size;
>>   -    if (tail < head && tmp_tail >= head)
>> -        goto no_space;
>> -
>> -    if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
>> -                 mb_msg->pkg_size >= head))
>> -        goto no_space;
>>   -    if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
>> +check_again:
>> +    if (tail >= head && tmp_tail > ringbuf_size) {
>>           write_addr = mb_chann->mb->res.ringbuf_base + start_addr + 
>> tail;
>>           writel(TOMBSTONE, write_addr);
>>             /* tombstone is set. Write from the start of the ringbuf */
>>           tail = 0;
>> +        tmp_tail = tail + mb_msg->pkg_size;
>> +    }
>> +
>> +    if (tail < head && tmp_tail >= head) {
>> +        ret = read_poll_timeout(mailbox_get_headptr, head,
>> +                    tmp_tail < head || tail >= head,
>> +                    1, 100, false, mb_chann, CHAN_RES_X2I);
>> +        if (ret)
>> +            return ret;
>> +
>> +        if (tail >= head)
>> +            goto check_again;
>>       }
>>         write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
>> @@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel 
>> *mb_chann, struct mailbox_msg *mb_msg)
>>                   mb_msg->pkg.header.id);
>>         return 0;
>> -
>> -no_space:
>> -    return -ENOSPC;
>>   }
>>     static int
>

Re: [PATCH V1] accel/amdxdna: Fix race where send ring appears full due to delayed head update

Posted by Mario Limonciello 1 month, 3 weeks ago

On 12/11/25 10:41 PM, Lizhi Hou wrote:
> 
> On 12/11/25 13:48, Mario Limonciello wrote:
>> On 12/10/25 10:51 PM, Lizhi Hou wrote:
>>> The firmware sends a response and interrupts the driver before advancing
>>> the mailbox send ring head pointer. 
>>
>> What's the point of the interrupt then?  Is this possible to improve 
>> in future firmware or is this really a hardware issue?  If it can be 
>> fixed in firmware it would be ideal to conditionalize such behavior on 
>> firmware version.
> 
> This has been found recently with a muti-thread stress test with 
> released firmware. My understanding is that this is a firmware issue. I 
> am not sure if it could be improved in future firmware.

OK, thanks for explaining.

> 
> I believe this driver change is more robust. It does not change the 
> logic but adds an polling instead of returning error. If the firmware 
> updates header quick, there will not be any polling. It works for both 
> current and future firmware.
> 
> 

Yeah I don't think it's harmful, it's just unfortunate to have to land 
code like this when it sounds like a firmware bug with interrupt 
handling.  If the firmware can be improved at least it will be right on 
the very first read in the polling.

Reviewed-by: Mario Limonciello (AMD)

> Thanks,
> 
> Lizhi
> 
>>> As a result, the driver may observe
>>> the response and attempt to send a new request before the firmware has
>>> updated the head pointer. In this window, the send ring still appears
>>> full, causing the driver to incorrectly fail the send operation.
>>>
>>> This race can be triggered more easily in a multithreaded environment,
>>> leading to unexpected and spurious "send ring full" failures.
>>>
>>> To address this, poll the send ring head pointer for up to 100us before
>>> returning a full-ring condition. This allows the firmware time to update
>>> the head pointer.
>>>
>>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>> ---
>>>   drivers/accel/amdxdna/amdxdna_mailbox.c | 27 +++++++++++++++----------
>>>   1 file changed, 16 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/ 
>>> amdxdna/amdxdna_mailbox.c
>>> index a60a85ce564c..469242ed8224 100644
>>> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
>>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
>>> @@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel 
>>> *mb_chann, struct mailbox_msg *mb_msg)
>>>       u32 head, tail;
>>>       u32 start_addr;
>>>       u32 tmp_tail;
>>> +    int ret;
>>>         head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
>>>       tail = mb_chann->x2i_tail;
>>> -    ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
>>> +    ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I) 
>>> - sizeof(u32);
>>>       start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
>>>       tmp_tail = tail + mb_msg->pkg_size;
>>>   -    if (tail < head && tmp_tail >= head)
>>> -        goto no_space;
>>> -
>>> -    if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
>>> -                 mb_msg->pkg_size >= head))
>>> -        goto no_space;
>>>   -    if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
>>> +check_again:
>>> +    if (tail >= head && tmp_tail > ringbuf_size) {
>>>           write_addr = mb_chann->mb->res.ringbuf_base + start_addr + 
>>> tail;
>>>           writel(TOMBSTONE, write_addr);
>>>             /* tombstone is set. Write from the start of the ringbuf */
>>>           tail = 0;
>>> +        tmp_tail = tail + mb_msg->pkg_size;
>>> +    }
>>> +
>>> +    if (tail < head && tmp_tail >= head) {
>>> +        ret = read_poll_timeout(mailbox_get_headptr, head,
>>> +                    tmp_tail < head || tail >= head,
>>> +                    1, 100, false, mb_chann, CHAN_RES_X2I);
>>> +        if (ret)
>>> +            return ret;
>>> +
>>> +        if (tail >= head)
>>> +            goto check_again;
>>>       }
>>>         write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail;
>>> @@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel 
>>> *mb_chann, struct mailbox_msg *mb_msg)
>>>                   mb_msg->pkg.header.id);
>>>         return 0;
>>> -
>>> -no_space:
>>> -    return -ENOSPC;
>>>   }
>>>     static int
>>

Re: [PATCH V1] accel/amdxdna: Fix race where send ring appears full due to delayed head update

Posted by Lizhi Hou 1 month, 3 weeks ago

Applied to drm-misc-next.

On 12/12/25 08:43, Mario Limonciello wrote:
> On 12/11/25 10:41 PM, Lizhi Hou wrote:
>>
>> On 12/11/25 13:48, Mario Limonciello wrote:
>>> On 12/10/25 10:51 PM, Lizhi Hou wrote:
>>>> The firmware sends a response and interrupts the driver before 
>>>> advancing
>>>> the mailbox send ring head pointer. 
>>>
>>> What's the point of the interrupt then?  Is this possible to improve 
>>> in future firmware or is this really a hardware issue?  If it can be 
>>> fixed in firmware it would be ideal to conditionalize such behavior 
>>> on firmware version.
>>
>> This has been found recently with a muti-thread stress test with 
>> released firmware. My understanding is that this is a firmware issue. 
>> I am not sure if it could be improved in future firmware.
>
> OK, thanks for explaining.
>
>>
>> I believe this driver change is more robust. It does not change the 
>> logic but adds an polling instead of returning error. If the firmware 
>> updates header quick, there will not be any polling. It works for 
>> both current and future firmware.
>>
>>
>
> Yeah I don't think it's harmful, it's just unfortunate to have to land 
> code like this when it sounds like a firmware bug with interrupt 
> handling.  If the firmware can be improved at least it will be right 
> on the very first read in the polling.
>
> Reviewed-by: Mario Limonciello (AMD)
>
>> Thanks,
>>
>> Lizhi
>>
>>>> As a result, the driver may observe
>>>> the response and attempt to send a new request before the firmware has
>>>> updated the head pointer. In this window, the send ring still appears
>>>> full, causing the driver to incorrectly fail the send operation.
>>>>
>>>> This race can be triggered more easily in a multithreaded environment,
>>>> leading to unexpected and spurious "send ring full" failures.
>>>>
>>>> To address this, poll the send ring head pointer for up to 100us 
>>>> before
>>>> returning a full-ring condition. This allows the firmware time to 
>>>> update
>>>> the head pointer.
>>>>
>>>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>>> ---
>>>>   drivers/accel/amdxdna/amdxdna_mailbox.c | 27 
>>>> +++++++++++++++----------
>>>>   1 file changed, 16 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c 
>>>> b/drivers/accel/ amdxdna/amdxdna_mailbox.c
>>>> index a60a85ce564c..469242ed8224 100644
>>>> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
>>>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
>>>> @@ -191,26 +191,34 @@ mailbox_send_msg(struct mailbox_channel 
>>>> *mb_chann, struct mailbox_msg *mb_msg)
>>>>       u32 head, tail;
>>>>       u32 start_addr;
>>>>       u32 tmp_tail;
>>>> +    int ret;
>>>>         head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I);
>>>>       tail = mb_chann->x2i_tail;
>>>> -    ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I);
>>>> +    ringbuf_size = mailbox_get_ringbuf_size(mb_chann, 
>>>> CHAN_RES_X2I) - sizeof(u32);
>>>>       start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr;
>>>>       tmp_tail = tail + mb_msg->pkg_size;
>>>>   -    if (tail < head && tmp_tail >= head)
>>>> -        goto no_space;
>>>> -
>>>> -    if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) &&
>>>> -                 mb_msg->pkg_size >= head))
>>>> -        goto no_space;
>>>>   -    if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) {
>>>> +check_again:
>>>> +    if (tail >= head && tmp_tail > ringbuf_size) {
>>>>           write_addr = mb_chann->mb->res.ringbuf_base + start_addr 
>>>> + tail;
>>>>           writel(TOMBSTONE, write_addr);
>>>>             /* tombstone is set. Write from the start of the 
>>>> ringbuf */
>>>>           tail = 0;
>>>> +        tmp_tail = tail + mb_msg->pkg_size;
>>>> +    }
>>>> +
>>>> +    if (tail < head && tmp_tail >= head) {
>>>> +        ret = read_poll_timeout(mailbox_get_headptr, head,
>>>> +                    tmp_tail < head || tail >= head,
>>>> +                    1, 100, false, mb_chann, CHAN_RES_X2I);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +
>>>> +        if (tail >= head)
>>>> +            goto check_again;
>>>>       }
>>>>         write_addr = mb_chann->mb->res.ringbuf_base + start_addr + 
>>>> tail;
>>>> @@ -222,9 +230,6 @@ mailbox_send_msg(struct mailbox_channel 
>>>> *mb_chann, struct mailbox_msg *mb_msg)
>>>>                   mb_msg->pkg.header.id);
>>>>         return 0;
>>>> -
>>>> -no_space:
>>>> -    return -ENOSPC;
>>>>   }
>>>>     static int
>>>
>