drivers/accel/amdxdna/aie2_ctx.c | 12 ++++++++++-- drivers/accel/amdxdna/amdxdna_ctx.h | 1 + 2 files changed, 11 insertions(+), 2 deletions(-)
When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
commands that are canceled due to this timeout should be marked as
ERT_CMD_STATE_ABORT.
Fixes: aac243092b70 ("accel/amdxdna: Add command execution")
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
drivers/accel/amdxdna/aie2_ctx.c | 12 ++++++++++--
drivers/accel/amdxdna/amdxdna_ctx.h | 1 +
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index c6c473c78352..958a64bb5251 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -204,10 +204,12 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
cmd_abo = job->cmd_bo;
- if (unlikely(!data))
+ if (unlikely(job->job_timeout)) {
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
goto out;
+ }
- if (unlikely(size != sizeof(u32))) {
+ if (unlikely(!data) || unlikely(size != sizeof(u32))) {
amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
ret = -EINVAL;
goto out;
@@ -258,6 +260,11 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
int ret = 0;
cmd_abo = job->cmd_bo;
+ if (unlikely(job->job_timeout)) {
+ amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
+ goto out;
+ }
+
if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
ret = -EINVAL;
@@ -370,6 +377,7 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
xdna = hwctx->client->xdna;
trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
+ job->job_timeout = true;
mutex_lock(&xdna->dev_lock);
aie2_hwctx_stop(xdna, hwctx, sched_job);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index cbe60efbe60b..919c654dfea6 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -116,6 +116,7 @@ struct amdxdna_sched_job {
/* user can wait on this fence */
struct dma_fence *out_fence;
bool job_done;
+ bool job_timeout;
u64 seq;
struct amdxdna_drv_cmd *drv_cmd;
struct amdxdna_gem_obj *cmd_bo;
--
2.34.1
On 10/28/25 12:54 PM, Lizhi Hou wrote:
> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
> commands that are canceled due to this timeout should be marked as
> ERT_CMD_STATE_ABORT.
>
> Fixes: aac243092b70 ("accel/amdxdna: Add command execution")
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> ---
> drivers/accel/amdxdna/aie2_ctx.c | 12 ++++++++++--
> drivers/accel/amdxdna/amdxdna_ctx.h | 1 +
> 2 files changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> index c6c473c78352..958a64bb5251 100644
> --- a/drivers/accel/amdxdna/aie2_ctx.c
> +++ b/drivers/accel/amdxdna/aie2_ctx.c
> @@ -204,10 +204,12 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
>
> cmd_abo = job->cmd_bo;
>
> - if (unlikely(!data))
> + if (unlikely(job->job_timeout)) {
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
> goto out;
> + }
>
> - if (unlikely(size != sizeof(u32))) {
> + if (unlikely(!data) || unlikely(size != sizeof(u32))) {
> amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
> ret = -EINVAL;
> goto out;
> @@ -258,6 +260,11 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
> int ret = 0;
>
> cmd_abo = job->cmd_bo;
> + if (unlikely(job->job_timeout)) {
> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
Don't you need to set ret here?
> + goto out;
> + }
> +
> if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
> amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
> ret = -EINVAL;
> @@ -370,6 +377,7 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
>
> xdna = hwctx->client->xdna;
> trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
> + job->job_timeout = true;
> mutex_lock(&xdna->dev_lock);
> aie2_hwctx_stop(xdna, hwctx, sched_job);
>
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
> index cbe60efbe60b..919c654dfea6 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
> @@ -116,6 +116,7 @@ struct amdxdna_sched_job {
> /* user can wait on this fence */
> struct dma_fence *out_fence;
> bool job_done;
> + bool job_timeout;
> u64 seq;
> struct amdxdna_drv_cmd *drv_cmd;
> struct amdxdna_gem_obj *cmd_bo;
On 10/29/25 07:28, Mario Limonciello wrote:
> On 10/28/25 12:54 PM, Lizhi Hou wrote:
>> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other
>> commands that are canceled due to this timeout should be marked as
>> ERT_CMD_STATE_ABORT.
>>
>> Fixes: aac243092b70 ("accel/amdxdna: Add command execution")
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> ---
>> drivers/accel/amdxdna/aie2_ctx.c | 12 ++++++++++--
>> drivers/accel/amdxdna/amdxdna_ctx.h | 1 +
>> 2 files changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/aie2_ctx.c
>> b/drivers/accel/amdxdna/aie2_ctx.c
>> index c6c473c78352..958a64bb5251 100644
>> --- a/drivers/accel/amdxdna/aie2_ctx.c
>> +++ b/drivers/accel/amdxdna/aie2_ctx.c
>> @@ -204,10 +204,12 @@ aie2_sched_resp_handler(void *handle, void
>> __iomem *data, size_t size)
>> cmd_abo = job->cmd_bo;
>> - if (unlikely(!data))
>> + if (unlikely(job->job_timeout)) {
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
>> goto out;
>> + }
>> - if (unlikely(size != sizeof(u32))) {
>> + if (unlikely(!data) || unlikely(size != sizeof(u32))) {
>> amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>> ret = -EINVAL;
>> goto out;
>> @@ -258,6 +260,11 @@ aie2_sched_cmdlist_resp_handler(void *handle,
>> void __iomem *data, size_t size)
>> int ret = 0;
>> cmd_abo = job->cmd_bo;
>> + if (unlikely(job->job_timeout)) {
>> + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT);
>
> Don't you need to set ret here?
Yes. I should set ret to -EINVAL for a timed out request.
Thanks
Lizhi
>
>> + goto out;
>> + }
>> +
>> if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
>> amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
>> ret = -EINVAL;
>> @@ -370,6 +377,7 @@ aie2_sched_job_timedout(struct drm_sched_job
>> *sched_job)
>> xdna = hwctx->client->xdna;
>> trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
>> + job->job_timeout = true;
>> mutex_lock(&xdna->dev_lock);
>> aie2_hwctx_stop(xdna, hwctx, sched_job);
>> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h
>> b/drivers/accel/amdxdna/amdxdna_ctx.h
>> index cbe60efbe60b..919c654dfea6 100644
>> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
>> @@ -116,6 +116,7 @@ struct amdxdna_sched_job {
>> /* user can wait on this fence */
>> struct dma_fence *out_fence;
>> bool job_done;
>> + bool job_timeout;
>> u64 seq;
>> struct amdxdna_drv_cmd *drv_cmd;
>> struct amdxdna_gem_obj *cmd_bo;
>
> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other > commands that are canceled due to this timeout should be marked as > ERT_CMD_STATE_ABORT. Would it become helpful to use additional labels for the same state settings? https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226 Regards, Markus
On 10/28/25 13:30, Markus Elfring wrote: >> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other >> commands that are canceled due to this timeout should be marked as >> ERT_CMD_STATE_ABORT. > Would it become helpful to use additional labels for the same state settings? > https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226 I do not fully understand your comment. Could you explain more? Thanks, Lizhi > > Regards, > Markus
>>> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other >>> commands that are canceled due to this timeout should be marked as >>> ERT_CMD_STATE_ABORT. >> Would it become helpful to use additional labels for the same state settings? >> https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226 > > I do not fully understand your comment. Could you explain more? Do you propose to use the same state settings in a few if branches? How do you think about to avoid duplicate statements accordingly? Regards, Markus
On 10/28/25 13:43, Markus Elfring wrote: >>>> When a command times out, mark it as ERT_CMD_STATE_TIMEOUT. Any other >>>> commands that are canceled due to this timeout should be marked as >>>> ERT_CMD_STATE_ABORT. >>> Would it become helpful to use additional labels for the same state settings? >>> https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226 >> I do not fully understand your comment. Could you explain more? > Do you propose to use the same state settings in a few if branches? What are the same state settings? > How do you think about to avoid duplicate statements accordingly? What are the duplicate statements? Sorry that I am still a little confused. The code change is to distinguish between the job which is timed out and the job is aborted. Thanks, Lizhi > > Regards, > Markus
> Sorry that I am still a little confused. By the way: How do you think about to move the statement “cmd_abo = job->cmd_bo;” behind the applied input parameter validation? https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226 Regards, Markus
On 10/29/25 03:40, Markus Elfring wrote: >> Sorry that I am still a little confused. > By the way: > How do you think about to move the statement “cmd_abo = job->cmd_bo;” > behind the applied input parameter validation? > https://elixir.bootlin.com/linux/v6.18-rc3/source/drivers/accel/amdxdna/aie2_ctx.c#L197-L226 With the patch, the order changed. It needs to read the timeout flag from cmd_abo. Lizhi > > Regards, > Markus
> What are the duplicate statements? > > Sorry that I am still a little confused. > > > The code change is to distinguish between the job which is timed out and the job is aborted. Would you like to clarify the usage incidence for amdxdna_cmd_set_state() calls in combination with assignments to the variable “ret” (according to the implementation of the affected function “aie2_sched_resp_handler”)? https://elixir.bootlin.com/linux/v6.18-rc3/C/ident/amdxdna_cmd_set_state Regards, Markus
On 10/29/25 00:53, Markus Elfring wrote: >> What are the duplicate statements? >> >> Sorry that I am still a little confused. >> >> >> The code change is to distinguish between the job which is timed out and the job is aborted. > Would you like to clarify the usage incidence for amdxdna_cmd_set_state() calls > in combination with assignments to the variable “ret” (according to the implementation > of the affected function “aie2_sched_resp_handler”)? > https://elixir.bootlin.com/linux/v6.18-rc3/C/ident/amdxdna_cmd_set_state Sure. amdxdna_cmd_set_state() updates the return code to command buffer. So application which issues the command will be able to get the return code. The function return value "ret" is used by mailbox receiving kernel thread to deal with the error. Thanks, Lizhi > > Regards, > Markus
>> https://elixir.bootlin.com/linux/v6.18-rc3/C/ident/amdxdna_cmd_set_state > > Sure. amdxdna_cmd_set_state() updates the return code to command buffer. So application which issues the command will be able to get the return code. > > The function return value "ret" is used by mailbox receiving kernel thread to deal with the error. I miss a clearer answer for the indicated function call incidence. Can it be helpful to determine the state value before it would be passed to a concrete call? Regards, Markus
On 10/29/25 11:54, Markus Elfring wrote: >>> https://elixir.bootlin.com/linux/v6.18-rc3/C/ident/amdxdna_cmd_set_state >> Sure. amdxdna_cmd_set_state() updates the return code to command buffer. So application which issues the command will be able to get the return code. >> >> The function return value "ret" is used by mailbox receiving kernel thread to deal with the error. > I miss a clearer answer for the indicated function call incidence. > > Can it be helpful to determine the state value before it would be passed to a concrete call? aie2_sched_resp_handler() is called either after get the firmware response through mailbox or the request is timed out/ canceled. So in this handler, it based on the response to set the state field in command buffer. What do you mean here for "determine the state value before it would be passed to a concrete call?". What is your concern here? Maybe you can provide a simple patch if you think there is anything can be improved? Thanks, Lizhi > > Regards, > Markus
> What do you mean here for "determine the state value before it would be passed to a concrete call?". What is your concern here? How often are the (same) state values passed to amdxdna_cmd_set_state() calls? Regards, Markus
© 2016 - 2026 Red Hat, Inc.