[PATCH] accel/amdxdna: Fix incorrect command state for timed out job

Lizhi Hou posted 1 patch 3 months, 1 week ago
There is a newer version of this series
drivers/accel/amdxdna/aie2_ctx.c    | 12 ++++++++++--
drivers/accel/amdxdna/amdxdna_ctx.h |  1 +
2 files changed, 11 insertions(+), 2 deletions(-)
[PATCH] accel/amdxdna: Fix incorrect command state for timed out job
Posted by Lizhi Hou 3 months, 1 week ago
When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
commands that are canceled due to this timeout should be marked as
ERT_CMD_STATE_ABORT.

Fixes: aac243092b70 ("accel/amdxdna: Add command execution")
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
 drivers/accel/amdxdna/aie2_ctx.c    | 12 ++++++++++--
 drivers/accel/amdxdna/amdxdna_ctx.h |  1 +
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index c6c473c78352..958a64bb5251 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -204,10 +204,12 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
 
 	cmd_abo = job->cmd_bo;
 
-	if (unlikely(!data))
+	if (unlikely(job->job_timeout)) {
+		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
 		goto out;
+	}
 
-	if (unlikely(size != sizeof(u32))) {
+	if (unlikely(!data) || unlikely(size != sizeof(u32))) {
 		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
 		ret = -EINVAL;
 		goto out;
@@ -258,6 +260,11 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
 	int ret = 0;
 
 	cmd_abo = job->cmd_bo;
+	if (unlikely(job->job_timeout)) {
+		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
+		goto out;
+	}
+
 	if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
 		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
 		ret = -EINVAL;
@@ -370,6 +377,7 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
 
 	xdna = hwctx->client->xdna;
 	trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
+	job->job_timeout = true;
 	mutex_lock(&xdna->dev_lock);
 	aie2_hwctx_stop(xdna, hwctx, sched_job);
 
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index cbe60efbe60b..919c654dfea6 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -116,6 +116,7 @@ struct amdxdna_sched_job {
 	/* user can wait on this fence */
 	struct dma_fence	*out_fence;
 	bool			job_done;
+	bool			job_timeout;
 	u64			seq;
 	struct amdxdna_drv_cmd	*drv_cmd;
 	struct amdxdna_gem_obj	*cmd_bo;
-- 
2.34.1
Re: [PATCH] accel/amdxdna: Fix incorrect command state for timed out job
Posted by Mario Limonciello 3 months, 1 week ago
On 10/28/25 12:54 PM, Lizhi Hou wrote:
> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
> commands that are canceled due to this timeout should be marked as
> ERT_CMD_STATE_ABORT.
> 
> Fixes: aac243092b70 ("accel/amdxdna: Add command execution")
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> ---
>   drivers/accel/amdxdna/aie2_ctx.c    | 12 ++++++++++--
>   drivers/accel/amdxdna/amdxdna_ctx.h |  1 +
>   2 files changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> index c6c473c78352..958a64bb5251 100644
> --- a/drivers/accel/amdxdna/aie2_ctx.c
> +++ b/drivers/accel/amdxdna/aie2_ctx.c
> @@ -204,10 +204,12 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
>   
>   	cmd_abo = job->cmd_bo;
>   
> -	if (unlikely(!data))
> +	if (unlikely(job->job_timeout)) {
> +		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
>   		goto out;
> +	}
>   
> -	if (unlikely(size != sizeof(u32))) {
> +	if (unlikely(!data) || unlikely(size != sizeof(u32))) {
>   		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>   		ret = -EINVAL;
>   		goto out;
> @@ -258,6 +260,11 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
>   	int ret = 0;
>   
>   	cmd_abo = job->cmd_bo;
> +	if (unlikely(job->job_timeout)) {
> +		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);

Don't you need to set ret here?

> +		goto out;
> +	}
> +
>   	if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
>   		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>   		ret = -EINVAL;
> @@ -370,6 +377,7 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
>   
>   	xdna = hwctx->client->xdna;
>   	trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
> +	job->job_timeout = true;
>   	mutex_lock(&xdna->dev_lock);
>   	aie2_hwctx_stop(xdna, hwctx, sched_job);
>   
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
> index cbe60efbe60b..919c654dfea6 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
> @@ -116,6 +116,7 @@ struct amdxdna_sched_job {
>   	/* user can wait on this fence */
>   	struct dma_fence	*out_fence;
>   	bool			job_done;
> +	bool			job_timeout;
>   	u64			seq;
>   	struct amdxdna_drv_cmd	*drv_cmd;
>   	struct amdxdna_gem_obj	*cmd_bo;
Re: [PATCH] accel/amdxdna: Fix incorrect command state for timed out job
Posted by Lizhi Hou 3 months, 1 week ago
On 10/29/25 07:28, Mario Limonciello wrote:
> On 10/28/25 12:54 PM, Lizhi Hou wrote:
>> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
>> commands that are canceled due to this timeout should be marked as
>> ERT_CMD_STATE_ABORT.
>>
>> Fixes: aac243092b70 ("accel/amdxdna: Add command execution")
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> ---
>>   drivers/accel/amdxdna/aie2_ctx.c    | 12 ++++++++++--
>>   drivers/accel/amdxdna/amdxdna_ctx.h |  1 +
>>   2 files changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/aie2_ctx.c 
>> b/drivers/accel/amdxdna/aie2_ctx.c
>> index c6c473c78352..958a64bb5251 100644
>> --- a/drivers/accel/amdxdna/aie2_ctx.c
>> +++ b/drivers/accel/amdxdna/aie2_ctx.c
>> @@ -204,10 +204,12 @@ aie2_sched_resp_handler(void *handle, void 
>> __iomem *data, size_t size)
>>         cmd_abo = job->cmd_bo;
>>   -    if (unlikely(!data))
>> +    if (unlikely(job->job_timeout)) {
>> +        amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
>>           goto out;
>> +    }
>>   -    if (unlikely(size != sizeof(u32))) {
>> +    if (unlikely(!data) || unlikely(size != sizeof(u32))) {
>>           amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>>           ret = -EINVAL;
>>           goto out;
>> @@ -258,6 +260,11 @@ aie2_sched_cmdlist_resp_handler(void *handle, 
>> void __iomem *data, size_t size)
>>       int ret = 0;
>>         cmd_abo = job->cmd_bo;
>> +    if (unlikely(job->job_timeout)) {
>> +        amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
>
> Don't you need to set ret here?

Yes. I should set ret to -EINVAL for a timed out request.


Thanks

Lizhi

>
>> +        goto out;
>> +    }
>> +
>>       if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
>>           amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>>           ret = -EINVAL;
>> @@ -370,6 +377,7 @@ aie2_sched_job_timedout(struct drm_sched_job 
>> *sched_job)
>>         xdna = hwctx->client->xdna;
>>       trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
>> +    job->job_timeout = true;
>>       mutex_lock(&xdna->dev_lock);
>>       aie2_hwctx_stop(xdna, hwctx, sched_job);
>>   diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h 
>> b/drivers/accel/amdxdna/amdxdna_ctx.h
>> index cbe60efbe60b..919c654dfea6 100644
>> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
>> @@ -116,6 +116,7 @@ struct amdxdna_sched_job {
>>       /* user can wait on this fence */
>>       struct dma_fence    *out_fence;
>>       bool            job_done;
>> +    bool            job_timeout;
>>       u64            seq;
>>       struct amdxdna_drv_cmd    *drv_cmd;
>>       struct amdxdna_gem_obj    *cmd_bo;
>
Re: [PATCH] accel/amdxdna: Fix incorrect command state for timed out job
Posted by Markus Elfring 3 months, 1 week ago
> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
> commands that are canceled due to this timeout should be marked as
> ERT_CMD_STATE_ABORT.

Would it become helpful to use additional labels for the same state settings?
https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226

Regards,
Markus
Re: [PATCH] accel/amdxdna: Fix incorrect command state for timed out job
Posted by Lizhi Hou 3 months, 1 week ago
On 10/28/25 13:30, Markus Elfring wrote:
>> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
>> commands that are canceled due to this timeout should be marked as
>> ERT_CMD_STATE_ABORT.
> Would it become helpful to use additional labels for the same state settings?
> https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226

I do not fully understand your comment. Could you explain more?

Thanks,

Lizhi

>
> Regards,
> Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Markus Elfring 3 months, 1 week ago
>>> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
>>> commands that are canceled due to this timeout should be marked as
>>> ERT_CMD_STATE_ABORT.
>> Would it become helpful to use additional labels for the same state settings?
>> https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226
> 
> I do not fully understand your comment. Could you explain more?

Do you propose to use the same state settings in a few if branches?
How do you think about to avoid duplicate statements accordingly?

Regards,
Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Lizhi Hou 3 months, 1 week ago
On 10/28/25 13:43, Markus Elfring wrote:
>>>> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
>>>> commands that are canceled due to this timeout should be marked as
>>>> ERT_CMD_STATE_ABORT.
>>> Would it become helpful to use additional labels for the same state settings?
>>> https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226
>> I do not fully understand your comment. Could you explain more?
> Do you propose to use the same state settings in a few if branches?
What are the same state settings?
> How do you think about to avoid duplicate statements accordingly?

What are the duplicate statements?

Sorry that I am still a little confused.


The code change is to distinguish between the job which is  timed out 
and the job is aborted.

Thanks,

Lizhi

>
> Regards,
> Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Markus Elfring 3 months, 1 week ago
> Sorry that I am still a little confused.

By the way:
How do you think about to move the statement “cmd_abo = job->cmd_bo;”
behind the applied input parameter validation?
https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226

Regards,
Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Lizhi Hou 3 months, 1 week ago
On 10/29/25 03:40, Markus Elfring wrote:
>> Sorry that I am still a little confused.
> By the way:
> How do you think about to move the statement “cmd_abo = job->cmd_bo;”
> behind the applied input parameter validation?
> https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226

With the patch, the order changed. It needs to read the timeout flag 
from cmd_abo.

Lizhi

>
> Regards,
> Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Markus Elfring 3 months, 1 week ago
> What are the duplicate statements?
> 
> Sorry that I am still a little confused.
> 
> 
> The code change is to distinguish between the job which is  timed out and the job is aborted.

Would you like to clarify the usage incidence for amdxdna_cmd_set_state() calls
in combination with assignments to the variable “ret” (according to the implementation
of the affected function “aie2_sched_resp_handler”)?
https://elixir.bootlin.com/linux/v6.18-rc3/C/ident/amdxdna_cmd_set_state

Regards,
Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Lizhi Hou 3 months, 1 week ago
On 10/29/25 00:53, Markus Elfring wrote:
>> What are the duplicate statements?
>>
>> Sorry that I am still a little confused.
>>
>>
>> The code change is to distinguish between the job which is  timed out and the job is aborted.
> Would you like to clarify the usage incidence for amdxdna_cmd_set_state() calls
> in combination with assignments to the variable “ret” (according to the implementation
> of the affected function “aie2_sched_resp_handler”)?
> https://elixir.bootlin.com/linux/v6.18-rc3/C/ident/amdxdna_cmd_set_state

Sure. amdxdna_cmd_set_state() updates the return code to command buffer. 
So application which issues the command will be able to get the return code.

The function return value "ret" is used by mailbox receiving kernel 
thread to deal with the error.


Thanks,

Lizhi

>
> Regards,
> Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Markus Elfring 3 months, 1 week ago
>> https://elixir.bootlin.com/linux/v6.18-rc3/C/ident/amdxdna_cmd_set_state
> 
> Sure. amdxdna_cmd_set_state() updates the return code to command buffer. So application which issues the command will be able to get the return code.
> 
> The function return value "ret" is used by mailbox receiving kernel thread to deal with the error.

I miss a clearer answer for the indicated function call incidence.

Can it be helpful to determine the state value before it would be passed to a concrete call?

Regards,
Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Lizhi Hou 3 months, 1 week ago
On 10/29/25 11:54, Markus Elfring wrote:
>>> https://elixir.bootlin.com/linux/v6.18-rc3/C/ident/amdxdna_cmd_set_state
>> Sure. amdxdna_cmd_set_state() updates the return code to command buffer. So application which issues the command will be able to get the return code.
>>
>> The function return value "ret" is used by mailbox receiving kernel thread to deal with the error.
> I miss a clearer answer for the indicated function call incidence.
>
> Can it be helpful to determine the state value before it would be passed to a concrete call?

aie2_sched_resp_handler() is called either after get the firmware 
response through mailbox or the request is timed out/ canceled. So in 
this handler, it based on the response to set the state field in command 
buffer.

What do you mean here for "determine the state value before it would be 
passed to a concrete call?". What is your concern here? Maybe you can 
provide a simple patch if you think there is anything can be improved?


Thanks,

Lizhi

>
> Regards,
> Markus
Re: accel/amdxdna: Fix incorrect command state for timed out job
Posted by Markus Elfring 3 months, 1 week ago
> What do you mean here for "determine the state value before it would be passed to a concrete call?". What is your concern here?

How often are the (same) state values passed to amdxdna_cmd_set_state() calls?

Regards,
Markus