[PATCH] accel/amdxdna: Check interrupt register before mailbox_rx_worker exits

Lizhi Hou posted 1 patch 11 months, 2 weeks ago
There is a newer version of this series
drivers/accel/amdxdna/amdxdna_mailbox.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
[PATCH] accel/amdxdna: Check interrupt register before mailbox_rx_worker exits
Posted by Lizhi Hou 11 months, 2 weeks ago
There is a timeout failure been found during stress tests. If the firmware
generates a mailbox response right after driver clears the mailbox channel
interrupt register, the hardware will not generate an interrupt for the
response. This causes the unexpected mailbox command timeout.

To handle this failure, driver checks the interrupt register before
exiting mailbox_rx_worker(). If there is a new response, driver goes back
to process it.

Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
 drivers/accel/amdxdna/amdxdna_mailbox.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c
index de7bf0fb4594..efe6cbc44d14 100644
--- a/drivers/accel/amdxdna/amdxdna_mailbox.c
+++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
@@ -348,8 +348,6 @@ static irqreturn_t mailbox_irq_handler(int irq, void *p)
 	trace_mbox_irq_handle(MAILBOX_NAME, irq);
 	/* Schedule a rx_work to call the callback functions */
 	queue_work(mb_chann->work_q, &mb_chann->rx_work);
-	/* Clear IOHUB register */
-	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
 
 	return IRQ_HANDLED;
 }
@@ -357,6 +355,7 @@ static irqreturn_t mailbox_irq_handler(int irq, void *p)
 static void mailbox_rx_worker(struct work_struct *rx_work)
 {
 	struct mailbox_channel *mb_chann;
+	u32 iohub;
 	int ret;
 
 	mb_chann = container_of(rx_work, struct mailbox_channel, rx_work);
@@ -366,6 +365,9 @@ static void mailbox_rx_worker(struct work_struct *rx_work)
 		return;
 	}
 
+again:
+	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
+
 	while (1) {
 		/*
 		 * If return is 0, keep consuming next message, until there is
@@ -380,9 +382,19 @@ static void mailbox_rx_worker(struct work_struct *rx_work)
 			MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret);
 			WRITE_ONCE(mb_chann->bad_state, true);
 			disable_irq(mb_chann->msix_irq);
-			break;
+			return;
 		}
 	}
+
+	/*
+	 * The hardware will not generate interrupt if firmware creates a new
+	 * response right after driver clears interrupt register. Check
+	 * the interrupt register to make sure there is not any new response
+	 * before exiting.
+	 */
+	iohub = mailbox_reg_read(mb_chann, mb_chann->iohub_int_addr);
+	if (iohub)
+		goto again;
 }
 
 int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann,
-- 
2.34.1
Re: [PATCH] accel/amdxdna: Check interrupt register before mailbox_rx_worker exits
Posted by Jacek Lawrynowicz 11 months, 2 weeks ago
Hi,

On 2/25/2025 6:26 PM, Lizhi Hou wrote:
> There is a timeout failure been found during stress tests. If the firmware
> generates a mailbox response right after driver clears the mailbox channel
> interrupt register, the hardware will not generate an interrupt for the
> response. This causes the unexpected mailbox command timeout.
> 
> To handle this failure, driver checks the interrupt register before
> exiting mailbox_rx_worker(). If there is a new response, driver goes back
> to process it.
> 
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> ---
>  drivers/accel/amdxdna/amdxdna_mailbox.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c
> index de7bf0fb4594..efe6cbc44d14 100644
> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
> @@ -348,8 +348,6 @@ static irqreturn_t mailbox_irq_handler(int irq, void *p)
>  	trace_mbox_irq_handle(MAILBOX_NAME, irq);
>  	/* Schedule a rx_work to call the callback functions */
>  	queue_work(mb_chann->work_q, &mb_chann->rx_work);
> -	/* Clear IOHUB register */
> -	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
>  
>  	return IRQ_HANDLED;
>  }
> @@ -357,6 +355,7 @@ static irqreturn_t mailbox_irq_handler(int irq, void *p)
>  static void mailbox_rx_worker(struct work_struct *rx_work)
>  {
>  	struct mailbox_channel *mb_chann;
> +	u32 iohub;
There is no need for this variable. Just use if (mailbox_reg_read()).

>  	int ret;
>  
>  	mb_chann = container_of(rx_work, struct mailbox_channel, rx_work);
> @@ -366,6 +365,9 @@ static void mailbox_rx_worker(struct work_struct *rx_work)
>  		return;
>  	}
>  
> +again:
> +	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
> +
>  	while (1) {
>  		/*
>  		 * If return is 0, keep consuming next message, until there is
> @@ -380,9 +382,19 @@ static void mailbox_rx_worker(struct work_struct *rx_work)
>  			MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret);
>  			WRITE_ONCE(mb_chann->bad_state, true);
>  			disable_irq(mb_chann->msix_irq);
> -			break;
> +			return;
disable_irq() should not be called here. It will be called for the second time in xdna_mailbox_stop_channel() and enable/disable calls should be balanced.

>  		}
>  	}
> +
> +	/*
> +	 * The hardware will not generate interrupt if firmware creates a new
> +	 * response right after driver clears interrupt register. Check
> +	 * the interrupt register to make sure there is not any new response
> +	 * before exiting.
> +	 */
> +	iohub = mailbox_reg_read(mb_chann, mb_chann->iohub_int_addr);
> +	if (iohub)
> +		goto again;
>  }
>  
>  int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann,

Regards,
Jacek
Re: [PATCH] accel/amdxdna: Check interrupt register before mailbox_rx_worker exits
Posted by Lizhi Hou 11 months, 2 weeks ago
On 2/26/25 01:30, Jacek Lawrynowicz wrote:
> Hi,
>
> On 2/25/2025 6:26 PM, Lizhi Hou wrote:
>> There is a timeout failure been found during stress tests. If the firmware
>> generates a mailbox response right after driver clears the mailbox channel
>> interrupt register, the hardware will not generate an interrupt for the
>> response. This causes the unexpected mailbox command timeout.
>>
>> To handle this failure, driver checks the interrupt register before
>> exiting mailbox_rx_worker(). If there is a new response, driver goes back
>> to process it.
>>
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> ---
>>   drivers/accel/amdxdna/amdxdna_mailbox.c | 18 +++++++++++++++---
>>   1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c
>> index de7bf0fb4594..efe6cbc44d14 100644
>> --- a/drivers/accel/amdxdna/amdxdna_mailbox.c
>> +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c
>> @@ -348,8 +348,6 @@ static irqreturn_t mailbox_irq_handler(int irq, void *p)
>>   	trace_mbox_irq_handle(MAILBOX_NAME, irq);
>>   	/* Schedule a rx_work to call the callback functions */
>>   	queue_work(mb_chann->work_q, &mb_chann->rx_work);
>> -	/* Clear IOHUB register */
>> -	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
>>   
>>   	return IRQ_HANDLED;
>>   }
>> @@ -357,6 +355,7 @@ static irqreturn_t mailbox_irq_handler(int irq, void *p)
>>   static void mailbox_rx_worker(struct work_struct *rx_work)
>>   {
>>   	struct mailbox_channel *mb_chann;
>> +	u32 iohub;
> There is no need for this variable. Just use if (mailbox_reg_read()).
Sure. Will fix this.
>
>>   	int ret;
>>   
>>   	mb_chann = container_of(rx_work, struct mailbox_channel, rx_work);
>> @@ -366,6 +365,9 @@ static void mailbox_rx_worker(struct work_struct *rx_work)
>>   		return;
>>   	}
>>   
>> +again:
>> +	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
>> +
>>   	while (1) {
>>   		/*
>>   		 * If return is 0, keep consuming next message, until there is
>> @@ -380,9 +382,19 @@ static void mailbox_rx_worker(struct work_struct *rx_work)
>>   			MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret);
>>   			WRITE_ONCE(mb_chann->bad_state, true);
>>   			disable_irq(mb_chann->msix_irq);
>> -			break;
>> +			return;
> disable_irq() should not be called here. It will be called for the second time in xdna_mailbox_stop_channel() and enable/disable calls should be balanced.

Agree. I will remove disable_irg().

Thanks,

Lizhi

>
>>   		}
>>   	}
>> +
>> +	/*
>> +	 * The hardware will not generate interrupt if firmware creates a new
>> +	 * response right after driver clears interrupt register. Check
>> +	 * the interrupt register to make sure there is not any new response
>> +	 * before exiting.
>> +	 */
>> +	iohub = mailbox_reg_read(mb_chann, mb_chann->iohub_int_addr);
>> +	if (iohub)
>> +		goto again;
>>   }
>>   
>>   int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann,
> Regards,
> Jacek