 drivers/accel/amdxdna/aie2_ctx.c      | 85 ++++++++++++++++++++++++---
 drivers/accel/amdxdna/aie2_message.c  | 41 +++++++++++++
 drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
 drivers/accel/amdxdna/aie2_pci.c      | 14 +++++
 drivers/accel/amdxdna/aie2_pci.h      |  5 ++
 drivers/accel/amdxdna/amdxdna_ctx.c   |  6 +-
 drivers/accel/amdxdna/amdxdna_ctx.h   | 18 +++++-
 drivers/accel/amdxdna/npu4_regs.c     |  3 +-
 8 files changed, 213 insertions(+), 11 deletions(-)
The firmware implements the GET_APP_HEALTH command to collect debug
information for a specific hardware context.
When a command times out, the driver issues this command to collect the
relevant debug information. User space tools can also retrieve this
information through the hardware context query IOCTL.
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
drivers/accel/amdxdna/aie2_ctx.c | 85 ++++++++++++++++++++++++---
drivers/accel/amdxdna/aie2_message.c | 41 +++++++++++++
drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
drivers/accel/amdxdna/aie2_pci.c | 14 +++++
drivers/accel/amdxdna/aie2_pci.h | 5 ++
drivers/accel/amdxdna/amdxdna_ctx.c | 6 +-
drivers/accel/amdxdna/amdxdna_ctx.h | 18 +++++-
drivers/accel/amdxdna/npu4_regs.c | 3 +-
8 files changed, 213 insertions(+), 11 deletions(-)
diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 779ac70d62d7..6292349868c5 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
#define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */
+struct aie2_ctx_health {
+ struct amdxdna_ctx_health header;
+ u32 txn_op_idx;
+ u32 ctx_pc;
+ u32 fatal_error_type;
+ u32 fatal_error_exception_type;
+ u32 fatal_error_exception_pc;
+ u32 fatal_error_app_module;
+};
+
static void aie2_job_release(struct kref *ref)
{
struct amdxdna_sched_job *job;
@@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
wake_up(&job->hwctx->priv->job_free_wq);
if (job->out_fence)
dma_fence_put(job->out_fence);
+ kfree(job->aie2_job_health);
kfree(job);
}
@@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
aie2_job_put(job);
}
+static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
+{
+ struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
+ struct amdxdna_dev *xdna = job->hwctx->client->xdna;
+ struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+ struct app_health_report *report = job->aie2_job_health;
+ u32 fail_cmd_idx = 0;
+
+ if (!report)
+ goto set_timeout;
+
+ XDNA_ERR(xdna, "Firmware timeout state capture:");
+ XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
+ XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
+ XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
+ XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
+ XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
+ XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
+ XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type);
+ XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type);
+ XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc);
+ XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module);
+ XDNA_ERR(xdna, "\tFatal error task ID: %d", report->fatal_info.task_index);
+ XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id);
+
+ fail_cmd_idx = report->run_list_id;
+ aie2_health = kzalloc_obj(*aie2_health);
+ if (!aie2_health)
+ goto set_timeout;
+
+ aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
+ aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
+ aie2_health->txn_op_idx = report->txn_op_id;
+ aie2_health->ctx_pc = report->ctx_pc;
+ aie2_health->fatal_error_type = report->fatal_info.fatal_type;
+ aie2_health->fatal_error_exception_type = report->fatal_info.exception_type;
+ aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc;
+ aie2_health->fatal_error_app_module = report->fatal_info.app_module;
+
+set_timeout:
+ amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
+ aie2_health, sizeof(*aie2_health));
+}
+
static int
aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
{
@@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
cmd_abo = job->cmd_bo;
if (unlikely(job->job_timeout)) {
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+ aie2_set_cmd_timeout(job);
ret = -EINVAL;
goto out;
}
if (unlikely(!data) || unlikely(size != sizeof(u32))) {
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+ amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
ret = -EINVAL;
goto out;
}
@@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
if (status == AIE2_STATUS_SUCCESS)
amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
else
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
+ amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0);
out:
aie2_sched_notify(job);
@@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
struct amdxdna_sched_job *job = handle;
struct amdxdna_gem_obj *cmd_abo;
struct amdxdna_dev *xdna;
+ u32 fail_cmd_idx = 0;
u32 fail_cmd_status;
- u32 fail_cmd_idx;
u32 cmd_status;
int ret = 0;
cmd_abo = job->cmd_bo;
if (unlikely(job->job_timeout)) {
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+ aie2_set_cmd_timeout(job);
ret = -EINVAL;
goto out;
}
if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+ amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
ret = -EINVAL;
goto out;
}
@@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
fail_cmd_idx, fail_cmd_status);
if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
- amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
+ amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0);
ret = -EINVAL;
} else {
- amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
+ amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0);
}
out:
@@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
{
struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
struct amdxdna_hwctx *hwctx = job->hwctx;
+ struct app_health_report *report;
struct amdxdna_dev *xdna;
+ int ret;
xdna = hwctx->client->xdna;
trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
job->job_timeout = true;
+
mutex_lock(&xdna->dev_lock);
+ report = kzalloc_obj(*report);
+ if (!report)
+ goto reset_hwctx;
+
+ ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
+ if (ret)
+ kfree(report);
+ else
+ job->aie2_job_health = report;
+
+reset_hwctx:
aie2_hwctx_stop(xdna, hwctx, sched_job);
aie2_hwctx_restart(xdna, hwctx);
diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index 798128b6b7b7..4ec591306854 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *
return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
}
+
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+ struct app_health_report *report)
+{
+ DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
+ struct amdxdna_dev *xdna = ndev->xdna;
+ struct app_health_report *buf;
+ dma_addr_t dma_addr;
+ u32 buf_size;
+ int ret;
+
+ if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
+ XDNA_DBG(xdna, "App health feature not supported");
+ return -EOPNOTSUPP;
+ }
+
+ buf_size = sizeof(*report);
+ buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
+ if (IS_ERR(buf)) {
+ XDNA_ERR(xdna, "Failed to allocate buffer for app health");
+ return PTR_ERR(buf);
+ }
+
+ req.buf_addr = dma_addr;
+ req.context_id = context_id;
+ req.buf_size = buf_size;
+
+ drm_clflush_virt_range(buf, sizeof(*report));
+ ret = aie2_send_mgmt_msg_wait(ndev, &msg);
+ if (ret) {
+ XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
+ goto free_buf;
+ }
+
+ /* Copy the report to caller's buffer */
+ memcpy(report, buf, sizeof(*report));
+
+free_buf:
+ aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
+ return ret;
+}
diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
index 728ef56f7f0a..f18e89a39e35 100644
--- a/drivers/accel/amdxdna/aie2_msg_priv.h
+++ b/drivers/accel/amdxdna/aie2_msg_priv.h
@@ -31,6 +31,7 @@ enum aie2_msg_opcode {
MSG_OP_SET_RUNTIME_CONFIG = 0x10A,
MSG_OP_GET_RUNTIME_CONFIG = 0x10B,
MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C,
+ MSG_OP_GET_APP_HEALTH = 0x114,
MSG_OP_MAX_DRV_OPCODE,
MSG_OP_GET_PROTOCOL_VERSION = 0x301,
MSG_OP_MAX_OPCODE
@@ -451,4 +452,55 @@ struct config_debug_bo_req {
struct config_debug_bo_resp {
enum aie2_msg_status status;
} __packed;
+
+struct fatal_error_info {
+ __u32 fatal_type; /* Fatal error type */
+ __u32 exception_type; /* Only valid if fatal_type is a specific value */
+ __u32 exception_argument; /* Argument based on exception type */
+ __u32 exception_pc; /* Program Counter at the time of the exception */
+ __u32 app_module; /* Error module name */
+ __u32 task_index; /* Index of the task in which the error occurred */
+ __u32 reserved[128];
+};
+
+struct app_health_report {
+ __u16 major;
+ __u16 minor;
+ __u32 size;
+ __u32 context_id;
+ /*
+ * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
+ * application. Before execution begins or after successful completion, the value is set
+ * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
+ * opcode's PC value.
+ * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+ * Proper interpretation requires familiarity with the implementation details.
+ */
+ __u32 dpu_pc;
+ /*
+ * Index of the last initiated TXN opcode.
+ * Before execution starts or after successful completion, the value is set to UINT_MAX.
+ * If execution halts prematurely due to an error, this field retains the opcode's ID.
+ * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+ * Proper interpretation requires familiarity with the implementation details.
+ */
+ __u32 txn_op_id;
+ /* The PC of the context at the time of the report */
+ __u32 ctx_pc;
+ struct fatal_error_info fatal_info;
+ /* Index of the most recently executed run list entry. */
+ __u32 run_list_id;
+};
+
+struct get_app_health_req {
+ __u32 context_id;
+ __u32 buf_size;
+ __u64 buf_addr;
+} __packed;
+
+struct get_app_health_resp {
+ enum aie2_msg_status status;
+ __u32 required_buffer_size;
+ __u32 reserved[7];
+} __packed;
#endif /* _AIE2_MSG_PRIV_H_ */
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index ddd3d82f3426..9e39bfe75971 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
struct amdxdna_drm_get_array *array_args = arg;
struct amdxdna_drm_hwctx_entry __user *buf;
+ struct app_health_report report;
+ struct amdxdna_dev_hdl *ndev;
u32 size;
+ int ret;
if (!array_args->num_element)
return -EINVAL;
@@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
tmp->latency = hwctx->qos.latency;
tmp->frame_exec_time = hwctx->qos.frame_exec_time;
tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
+ ndev = hwctx->client->xdna->dev_handle;
+ ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
+ if (!ret) {
+ /* Fill in app health report fields */
+ tmp->txn_op_idx = report.txn_op_id;
+ tmp->ctx_pc = report.ctx_pc;
+ tmp->fatal_error_type = report.fatal_info.fatal_type;
+ tmp->fatal_error_exception_type = report.fatal_info.exception_type;
+ tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
+ tmp->fatal_error_app_module = report.fatal_info.app_module;
+ }
buf = u64_to_user_ptr(array_args->buffer);
size = min(sizeof(*tmp), array_args->element_size);
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 885ae7e6bfc7..efcf4be035f0 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -10,6 +10,7 @@
#include <linux/limits.h>
#include <linux/semaphore.h>
+#include "aie2_msg_priv.h"
#include "amdxdna_mailbox.h"
#define AIE2_INTERVAL 20000 /* us */
@@ -261,6 +262,7 @@ enum aie2_fw_feature {
AIE2_NPU_COMMAND,
AIE2_PREEMPT,
AIE2_TEMPORAL_ONLY,
+ AIE2_APP_HEALTH,
AIE2_FEATURE_MAX
};
@@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
u32 min_minor;
};
+#define AIE2_ALL_FEATURES GENMASK_ULL(AIE2_FEATURE_MAX - 1, AIE2_NPU_COMMAND)
#define AIE2_FEATURE_ON(ndev, feature) test_bit(feature, &(ndev)->feature_mask)
struct amdxdna_dev_priv {
@@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *ver
int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata);
int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
struct amdxdna_fw_ver *fw_ver);
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+ struct app_health_report *report);
int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
index 666dfd7b2a80..4b921715176d 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
struct amdxdna_sched_job *job, u32 cmd_idx,
- enum ert_cmd_state error_state)
+ enum ert_cmd_state error_state,
+ void *err_data, size_t size)
{
struct amdxdna_client *client = job->hwctx->client;
struct amdxdna_cmd *cmd = abo->mem.kva;
@@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
}
memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
+ if (err_data)
+ memcpy(cmd->data, err_data, min(size, abo->mem.size - sizeof(*cmd)));
+
if (cc)
amdxdna_gem_put_obj(abo);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index fbdf9d000871..57db1527a93b 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
u32 prop_args[]; /* properties and regular kernel arguments */
};
+#define AMDXDNA_CMD_CTX_HEALTH_V1 1
+#define AMDXDNA_CMD_CTX_HEALTH_AIE2 0
+struct amdxdna_ctx_health {
+ u32 version;
+ u32 npu_gen;
+};
+
/* Exec buffer command header format */
#define AMDXDNA_CMD_STATE GENMASK(3, 0)
#define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
@@ -122,6 +129,11 @@ struct amdxdna_drv_cmd {
u32 result;
};
+struct app_health_report;
+union amdxdna_job_priv {
+ struct app_health_report *aie2_health;
+};
+
struct amdxdna_sched_job {
struct drm_sched_job base;
struct kref refcnt;
@@ -136,10 +148,13 @@ struct amdxdna_sched_job {
u64 seq;
struct amdxdna_drv_cmd *drv_cmd;
struct amdxdna_gem_obj *cmd_bo;
+ union amdxdna_job_priv priv;
size_t bo_cnt;
struct drm_gem_object *bos[] __counted_by(bo_cnt);
};
+#define aie2_job_health priv.aie2_health
+
static inline u32
amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
{
@@ -169,7 +184,8 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
struct amdxdna_sched_job *job, u32 cmd_idx,
- enum ert_cmd_state error_state);
+ enum ert_cmd_state error_state,
+ void *err_data, size_t size);
void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
index ce25eef5fc34..619bff042e52 100644
--- a/drivers/accel/amdxdna/npu4_regs.c
+++ b/drivers/accel/amdxdna/npu4_regs.c
@@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = {
{ .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
{ .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 },
{ .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 },
- { .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major = 7 },
+ { .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
+ { .features = AIE2_ALL_FEATURES, .major = 7 },
{ 0 }
};
--
2.34.1
On 3/16/26 23:49, Lizhi Hou wrote:
> The firmware implements the GET_APP_HEALTH command to collect debug
> information for a specific hardware context.
>
> When a command times out, the driver issues this command to collect the
> relevant debug information. User space tools can also retrieve this
> information through the hardware context query IOCTL.
>
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
> drivers/accel/amdxdna/aie2_ctx.c | 85 ++++++++++++++++++++++++---
> drivers/accel/amdxdna/aie2_message.c | 41 +++++++++++++
> drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
> drivers/accel/amdxdna/aie2_pci.c | 14 +++++
> drivers/accel/amdxdna/aie2_pci.h | 5 ++
> drivers/accel/amdxdna/amdxdna_ctx.c | 6 +-
> drivers/accel/amdxdna/amdxdna_ctx.h | 18 +++++-
> drivers/accel/amdxdna/npu4_regs.c | 3 +-
> 8 files changed, 213 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> index 779ac70d62d7..6292349868c5 100644
> --- a/drivers/accel/amdxdna/aie2_ctx.c
> +++ b/drivers/accel/amdxdna/aie2_ctx.c
> @@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
>
> #define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */
>
> +struct aie2_ctx_health {
> + struct amdxdna_ctx_health header;
> + u32 txn_op_idx;
> + u32 ctx_pc;
> + u32 fatal_error_type;
> + u32 fatal_error_exception_type;
> + u32 fatal_error_exception_pc;
> + u32 fatal_error_app_module;
> +};
> +
> static void aie2_job_release(struct kref *ref)
> {
> struct amdxdna_sched_job *job;
> @@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
> wake_up(&job->hwctx->priv->job_free_wq);
> if (job->out_fence)
> dma_fence_put(job->out_fence);
> + kfree(job->aie2_job_health);
> kfree(job);
> }
>
> @@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
> aie2_job_put(job);
> }
>
> +static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
> +{
> + struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
> + struct amdxdna_dev *xdna = job->hwctx->client->xdna;
> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> + struct app_health_report *report = job->aie2_job_health;
> + u32 fail_cmd_idx = 0;
> +
> + if (!report)
> + goto set_timeout;
> +
> + XDNA_ERR(xdna, "Firmware timeout state capture:");
> + XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
> + XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
> + XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
> + XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
> + XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
> + XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
> + XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type);
> + XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type);
> + XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc);
> + XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module);
> + XDNA_ERR(xdna, "\tFatal error task ID: %d", report->fatal_info.task_index);
> + XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id);
> +
> + fail_cmd_idx = report->run_list_id;
> + aie2_health = kzalloc_obj(*aie2_health);
> + if (!aie2_health)
> + goto set_timeout;
> +
> + aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
> + aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
> + aie2_health->txn_op_idx = report->txn_op_id;
> + aie2_health->ctx_pc = report->ctx_pc;
> + aie2_health->fatal_error_type = report->fatal_info.fatal_type;
> + aie2_health->fatal_error_exception_type = report->fatal_info.exception_type;
> + aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc;
> + aie2_health->fatal_error_app_module = report->fatal_info.app_module;
> +
> +set_timeout:
> + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
> + aie2_health, sizeof(*aie2_health));
> +}
> +
> static int
> aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
> {
> @@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
> cmd_abo = job->cmd_bo;
>
> if (unlikely(job->job_timeout)) {
> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
> + aie2_set_cmd_timeout(job);
> ret = -EINVAL;
> goto out;
> }
>
> if (unlikely(!data) || unlikely(size != sizeof(u32))) {
> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
> + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
> ret = -EINVAL;
> goto out;
> }
> @@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
> if (status == AIE2_STATUS_SUCCESS)
> amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
> else
> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
> + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0);
>
> out:
> aie2_sched_notify(job);
> @@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
> struct amdxdna_sched_job *job = handle;
> struct amdxdna_gem_obj *cmd_abo;
> struct amdxdna_dev *xdna;
> + u32 fail_cmd_idx = 0;
> u32 fail_cmd_status;
> - u32 fail_cmd_idx;
> u32 cmd_status;
> int ret = 0;
>
> cmd_abo = job->cmd_bo;
>
> if (unlikely(job->job_timeout)) {
> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
> + aie2_set_cmd_timeout(job);
> ret = -EINVAL;
> goto out;
> }
>
> if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
> + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
> ret = -EINVAL;
> goto out;
> }
> @@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
> fail_cmd_idx, fail_cmd_status);
>
> if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
> - amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
> + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0);
> ret = -EINVAL;
> } else {
> - amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
> + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0);
> }
>
> out:
> @@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
> {
> struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
> struct amdxdna_hwctx *hwctx = job->hwctx;
> + struct app_health_report *report;
> struct amdxdna_dev *xdna;
> + int ret;
>
> xdna = hwctx->client->xdna;
> trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
> job->job_timeout = true;
> +
> mutex_lock(&xdna->dev_lock);
> + report = kzalloc_obj(*report);
> + if (!report)
> + goto reset_hwctx;
> +
> + ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
> + if (ret)
> + kfree(report);
> + else
> + job->aie2_job_health = report;
> +
> +reset_hwctx:
> aie2_hwctx_stop(xdna, hwctx, sched_job);
>
> aie2_hwctx_restart(xdna, hwctx);
> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
> index 798128b6b7b7..4ec591306854 100644
> --- a/drivers/accel/amdxdna/aie2_message.c
> +++ b/drivers/accel/amdxdna/aie2_message.c
> @@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *
>
> return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
> }
> +
> +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
> + struct app_health_report *report)
> +{
> + DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
> + struct amdxdna_dev *xdna = ndev->xdna;
> + struct app_health_report *buf;
> + dma_addr_t dma_addr;
> + u32 buf_size;
> + int ret;
> +
> + if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
> + XDNA_DBG(xdna, "App health feature not supported");
> + return -EOPNOTSUPP;
> + }
> +
> + buf_size = sizeof(*report);
> + buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
> + if (IS_ERR(buf)) {
> + XDNA_ERR(xdna, "Failed to allocate buffer for app health");
> + return PTR_ERR(buf);
> + }
> +
> + req.buf_addr = dma_addr;
> + req.context_id = context_id;
> + req.buf_size = buf_size;
> +
> + drm_clflush_virt_range(buf, sizeof(*report));
> + ret = aie2_send_mgmt_msg_wait(ndev, &msg);
> + if (ret) {
> + XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
> + goto free_buf;
> + }
> +
> + /* Copy the report to caller's buffer */
> + memcpy(report, buf, sizeof(*report));
> +
> +free_buf:
> + aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
> + return ret;
> +}
> diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
> index 728ef56f7f0a..f18e89a39e35 100644
> --- a/drivers/accel/amdxdna/aie2_msg_priv.h
> +++ b/drivers/accel/amdxdna/aie2_msg_priv.h
> @@ -31,6 +31,7 @@ enum aie2_msg_opcode {
> MSG_OP_SET_RUNTIME_CONFIG = 0x10A,
> MSG_OP_GET_RUNTIME_CONFIG = 0x10B,
> MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C,
> + MSG_OP_GET_APP_HEALTH = 0x114,
> MSG_OP_MAX_DRV_OPCODE,
> MSG_OP_GET_PROTOCOL_VERSION = 0x301,
> MSG_OP_MAX_OPCODE
> @@ -451,4 +452,55 @@ struct config_debug_bo_req {
> struct config_debug_bo_resp {
> enum aie2_msg_status status;
> } __packed;
> +
> +struct fatal_error_info {
> + __u32 fatal_type; /* Fatal error type */
> + __u32 exception_type; /* Only valid if fatal_type is a specific value */
> + __u32 exception_argument; /* Argument based on exception type */
> + __u32 exception_pc; /* Program Counter at the time of the exception */
> + __u32 app_module; /* Error module name */
> + __u32 task_index; /* Index of the task in which the error occurred */
> + __u32 reserved[128];
> +};
> +
> +struct app_health_report {
> + __u16 major;
> + __u16 minor;
> + __u32 size;
> + __u32 context_id;
> + /*
> + * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
> + * application. Before execution begins or after successful completion, the value is set
> + * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
> + * opcode's PC value.
> + * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
> + * Proper interpretation requires familiarity with the implementation details.
> + */
> + __u32 dpu_pc;
> + /*
> + * Index of the last initiated TXN opcode.
> + * Before execution starts or after successful completion, the value is set to UINT_MAX.
> + * If execution halts prematurely due to an error, this field retains the opcode's ID.
> + * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
> + * Proper interpretation requires familiarity with the implementation details.
> + */
> + __u32 txn_op_id;
> + /* The PC of the context at the time of the report */
> + __u32 ctx_pc;
> + struct fatal_error_info fatal_info;
> + /* Index of the most recently executed run list entry. */
> + __u32 run_list_id;
> +};
> +
> +struct get_app_health_req {
> + __u32 context_id;
> + __u32 buf_size;
> + __u64 buf_addr;
> +} __packed;
> +
> +struct get_app_health_resp {
> + enum aie2_msg_status status;
> + __u32 required_buffer_size;
> + __u32 reserved[7];
> +} __packed;
> #endif /* _AIE2_MSG_PRIV_H_ */
> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
> index ddd3d82f3426..9e39bfe75971 100644
> --- a/drivers/accel/amdxdna/aie2_pci.c
> +++ b/drivers/accel/amdxdna/aie2_pci.c
> @@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
> struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
> struct amdxdna_drm_get_array *array_args = arg;
> struct amdxdna_drm_hwctx_entry __user *buf;
> + struct app_health_report report;
> + struct amdxdna_dev_hdl *ndev;
> u32 size;
> + int ret;
>
> if (!array_args->num_element)
> return -EINVAL;
> @@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
> tmp->latency = hwctx->qos.latency;
> tmp->frame_exec_time = hwctx->qos.frame_exec_time;
> tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
> + ndev = hwctx->client->xdna->dev_handle;
> + ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
> + if (!ret) {
> + /* Fill in app health report fields */
> + tmp->txn_op_idx = report.txn_op_id;
> + tmp->ctx_pc = report.ctx_pc;
> + tmp->fatal_error_type = report.fatal_info.fatal_type;
> + tmp->fatal_error_exception_type = report.fatal_info.exception_type;
> + tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
> + tmp->fatal_error_app_module = report.fatal_info.app_module;
> + }
>
> buf = u64_to_user_ptr(array_args->buffer);
> size = min(sizeof(*tmp), array_args->element_size);
> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> index 885ae7e6bfc7..efcf4be035f0 100644
> --- a/drivers/accel/amdxdna/aie2_pci.h
> +++ b/drivers/accel/amdxdna/aie2_pci.h
> @@ -10,6 +10,7 @@
> #include <linux/limits.h>
> #include <linux/semaphore.h>
>
> +#include "aie2_msg_priv.h"
> #include "amdxdna_mailbox.h"
>
> #define AIE2_INTERVAL 20000 /* us */
> @@ -261,6 +262,7 @@ enum aie2_fw_feature {
> AIE2_NPU_COMMAND,
> AIE2_PREEMPT,
> AIE2_TEMPORAL_ONLY,
> + AIE2_APP_HEALTH,
> AIE2_FEATURE_MAX
> };
>
> @@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
> u32 min_minor;
> };
>
> +#define AIE2_ALL_FEATURES GENMASK_ULL(AIE2_FEATURE_MAX - 1, AIE2_NPU_COMMAND)
> #define AIE2_FEATURE_ON(ndev, feature) test_bit(feature, &(ndev)->feature_mask)
>
> struct amdxdna_dev_priv {
> @@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *ver
> int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata);
> int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
> struct amdxdna_fw_ver *fw_ver);
> +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
> + struct app_health_report *report);
> int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
> int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
> int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
> index 666dfd7b2a80..4b921715176d 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.c
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
> @@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
>
> int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
> struct amdxdna_sched_job *job, u32 cmd_idx,
> - enum ert_cmd_state error_state)
> + enum ert_cmd_state error_state,
> + void *err_data, size_t size)
> {
> struct amdxdna_client *client = job->hwctx->client;
> struct amdxdna_cmd *cmd = abo->mem.kva;
> @@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
> }
>
> memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
> + if (err_data)
> + memcpy(cmd->data, err_data, min(size, abo->mem.size - sizeof(*cmd)));
> +
> if (cc)
> amdxdna_gem_put_obj(abo);
>
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
> index fbdf9d000871..57db1527a93b 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
> @@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
> u32 prop_args[]; /* properties and regular kernel arguments */
> };
>
> +#define AMDXDNA_CMD_CTX_HEALTH_V1 1
> +#define AMDXDNA_CMD_CTX_HEALTH_AIE2 0
> +struct amdxdna_ctx_health {
> + u32 version;
> + u32 npu_gen;
> +};
> +
> /* Exec buffer command header format */
> #define AMDXDNA_CMD_STATE GENMASK(3, 0)
> #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
> @@ -122,6 +129,11 @@ struct amdxdna_drv_cmd {
> u32 result;
> };
>
> +struct app_health_report;
> +union amdxdna_job_priv {
> + struct app_health_report *aie2_health;
> +};
> +
> struct amdxdna_sched_job {
> struct drm_sched_job base;
> struct kref refcnt;
> @@ -136,10 +148,13 @@ struct amdxdna_sched_job {
> u64 seq;
> struct amdxdna_drv_cmd *drv_cmd;
> struct amdxdna_gem_obj *cmd_bo;
> + union amdxdna_job_priv priv;
> size_t bo_cnt;
> struct drm_gem_object *bos[] __counted_by(bo_cnt);
> };
>
> +#define aie2_job_health priv.aie2_health
> +
> static inline u32
> amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
> {
> @@ -169,7 +184,8 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
> u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
> int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
> struct amdxdna_sched_job *job, u32 cmd_idx,
> - enum ert_cmd_state error_state);
> + enum ert_cmd_state error_state,
> + void *err_data, size_t size);
>
> void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
> void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
> diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
> index ce25eef5fc34..619bff042e52 100644
> --- a/drivers/accel/amdxdna/npu4_regs.c
> +++ b/drivers/accel/amdxdna/npu4_regs.c
> @@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = {
> { .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
> { .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 },
> { .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 },
> - { .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major = 7 },
> + { .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
> + { .features = AIE2_ALL_FEATURES, .major = 7 },
> { 0 }
> };
>
Applied to drm-misc-next.
On 3/17/26 12:25, Mario Limonciello wrote:
>
>
> On 3/16/26 23:49, Lizhi Hou wrote:
>> The firmware implements the GET_APP_HEALTH command to collect debug
>> information for a specific hardware context.
>>
>> When a command times out, the driver issues this command to collect the
>> relevant debug information. User space tools can also retrieve this
>> information through the hardware context query IOCTL.
>>
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
>> ---
>> drivers/accel/amdxdna/aie2_ctx.c | 85 ++++++++++++++++++++++++---
>> drivers/accel/amdxdna/aie2_message.c | 41 +++++++++++++
>> drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
>> drivers/accel/amdxdna/aie2_pci.c | 14 +++++
>> drivers/accel/amdxdna/aie2_pci.h | 5 ++
>> drivers/accel/amdxdna/amdxdna_ctx.c | 6 +-
>> drivers/accel/amdxdna/amdxdna_ctx.h | 18 +++++-
>> drivers/accel/amdxdna/npu4_regs.c | 3 +-
>> 8 files changed, 213 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/aie2_ctx.c
>> b/drivers/accel/amdxdna/aie2_ctx.c
>> index 779ac70d62d7..6292349868c5 100644
>> --- a/drivers/accel/amdxdna/aie2_ctx.c
>> +++ b/drivers/accel/amdxdna/aie2_ctx.c
>> @@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command
>> list (Default true)");
>> #define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */
>> +struct aie2_ctx_health {
>> + struct amdxdna_ctx_health header;
>> + u32 txn_op_idx;
>> + u32 ctx_pc;
>> + u32 fatal_error_type;
>> + u32 fatal_error_exception_type;
>> + u32 fatal_error_exception_pc;
>> + u32 fatal_error_app_module;
>> +};
>> +
>> static void aie2_job_release(struct kref *ref)
>> {
>> struct amdxdna_sched_job *job;
>> @@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
>> wake_up(&job->hwctx->priv->job_free_wq);
>> if (job->out_fence)
>> dma_fence_put(job->out_fence);
>> + kfree(job->aie2_job_health);
>> kfree(job);
>> }
>> @@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
>> aie2_job_put(job);
>> }
>> +static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
>> +{
>> + struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
>> + struct amdxdna_dev *xdna = job->hwctx->client->xdna;
>> + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
>> + struct app_health_report *report = job->aie2_job_health;
>> + u32 fail_cmd_idx = 0;
>> +
>> + if (!report)
>> + goto set_timeout;
>> +
>> + XDNA_ERR(xdna, "Firmware timeout state capture:");
>> + XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
>> + XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
>> + XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
>> + XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
>> + XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
>> + XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
>> + XDNA_ERR(xdna, "\tFatal error type: 0x%x",
>> report->fatal_info.fatal_type);
>> + XDNA_ERR(xdna, "\tFatal error exception type: 0x%x",
>> report->fatal_info.exception_type);
>> + XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x",
>> report->fatal_info.exception_pc);
>> + XDNA_ERR(xdna, "\tFatal error app module: 0x%x",
>> report->fatal_info.app_module);
>> + XDNA_ERR(xdna, "\tFatal error task ID: %d",
>> report->fatal_info.task_index);
>> + XDNA_ERR(xdna, "\tTimed out sub command ID: %d",
>> report->run_list_id);
>> +
>> + fail_cmd_idx = report->run_list_id;
>> + aie2_health = kzalloc_obj(*aie2_health);
>> + if (!aie2_health)
>> + goto set_timeout;
>> +
>> + aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
>> + aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
>> + aie2_health->txn_op_idx = report->txn_op_id;
>> + aie2_health->ctx_pc = report->ctx_pc;
>> + aie2_health->fatal_error_type = report->fatal_info.fatal_type;
>> + aie2_health->fatal_error_exception_type =
>> report->fatal_info.exception_type;
>> + aie2_health->fatal_error_exception_pc =
>> report->fatal_info.exception_pc;
>> + aie2_health->fatal_error_app_module =
>> report->fatal_info.app_module;
>> +
>> +set_timeout:
>> + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx,
>> ERT_CMD_STATE_TIMEOUT,
>> + aie2_health, sizeof(*aie2_health));
>> +}
>> +
>> static int
>> aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
>> {
>> @@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void
>> __iomem *data, size_t size)
>> cmd_abo = job->cmd_bo;
>> if (unlikely(job->job_timeout)) {
>> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
>> + aie2_set_cmd_timeout(job);
>> ret = -EINVAL;
>> goto out;
>> }
>> if (unlikely(!data) || unlikely(size != sizeof(u32))) {
>> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
>> + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT,
>> NULL, 0);
>> ret = -EINVAL;
>> goto out;
>> }
>> @@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void
>> __iomem *data, size_t size)
>> if (status == AIE2_STATUS_SUCCESS)
>> amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
>> else
>> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
>> + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR,
>> NULL, 0);
>> out:
>> aie2_sched_notify(job);
>> @@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle,
>> void __iomem *data, size_t size)
>> struct amdxdna_sched_job *job = handle;
>> struct amdxdna_gem_obj *cmd_abo;
>> struct amdxdna_dev *xdna;
>> + u32 fail_cmd_idx = 0;
>> u32 fail_cmd_status;
>> - u32 fail_cmd_idx;
>> u32 cmd_status;
>> int ret = 0;
>> cmd_abo = job->cmd_bo;
>> if (unlikely(job->job_timeout)) {
>> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
>> + aie2_set_cmd_timeout(job);
>> ret = -EINVAL;
>> goto out;
>> }
>> if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
>> - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
>> + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT,
>> NULL, 0);
>> ret = -EINVAL;
>> goto out;
>> }
>> @@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle,
>> void __iomem *data, size_t size)
>> fail_cmd_idx, fail_cmd_status);
>> if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
>> - amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx,
>> ERT_CMD_STATE_ABORT);
>> + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx,
>> ERT_CMD_STATE_ABORT, NULL, 0);
>> ret = -EINVAL;
>> } else {
>> - amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx,
>> ERT_CMD_STATE_ERROR);
>> + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx,
>> ERT_CMD_STATE_ERROR, NULL, 0);
>> }
>> out:
>> @@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job
>> *sched_job)
>> {
>> struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
>> struct amdxdna_hwctx *hwctx = job->hwctx;
>> + struct app_health_report *report;
>> struct amdxdna_dev *xdna;
>> + int ret;
>> xdna = hwctx->client->xdna;
>> trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
>> job->job_timeout = true;
>> +
>> mutex_lock(&xdna->dev_lock);
>> + report = kzalloc_obj(*report);
>> + if (!report)
>> + goto reset_hwctx;
>> +
>> + ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id,
>> report);
>> + if (ret)
>> + kfree(report);
>> + else
>> + job->aie2_job_health = report;
>> +
>> +reset_hwctx:
>> aie2_hwctx_stop(xdna, hwctx, sched_job);
>> aie2_hwctx_restart(xdna, hwctx);
>> diff --git a/drivers/accel/amdxdna/aie2_message.c
>> b/drivers/accel/amdxdna/aie2_message.c
>> index 798128b6b7b7..4ec591306854 100644
>> --- a/drivers/accel/amdxdna/aie2_message.c
>> +++ b/drivers/accel/amdxdna/aie2_message.c
>> @@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx
>> *hwctx, struct amdxdna_sched_job *
>> return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
>> }
>> +
>> +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
>> + struct app_health_report *report)
>> +{
>> + DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
>> + struct amdxdna_dev *xdna = ndev->xdna;
>> + struct app_health_report *buf;
>> + dma_addr_t dma_addr;
>> + u32 buf_size;
>> + int ret;
>> +
>> + if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
>> + XDNA_DBG(xdna, "App health feature not supported");
>> + return -EOPNOTSUPP;
>> + }
>> +
>> + buf_size = sizeof(*report);
>> + buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
>> + if (IS_ERR(buf)) {
>> + XDNA_ERR(xdna, "Failed to allocate buffer for app health");
>> + return PTR_ERR(buf);
>> + }
>> +
>> + req.buf_addr = dma_addr;
>> + req.context_id = context_id;
>> + req.buf_size = buf_size;
>> +
>> + drm_clflush_virt_range(buf, sizeof(*report));
>> + ret = aie2_send_mgmt_msg_wait(ndev, &msg);
>> + if (ret) {
>> + XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x",
>> ret, resp.status);
>> + goto free_buf;
>> + }
>> +
>> + /* Copy the report to caller's buffer */
>> + memcpy(report, buf, sizeof(*report));
>> +
>> +free_buf:
>> + aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
>> + return ret;
>> +}
>> diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h
>> b/drivers/accel/amdxdna/aie2_msg_priv.h
>> index 728ef56f7f0a..f18e89a39e35 100644
>> --- a/drivers/accel/amdxdna/aie2_msg_priv.h
>> +++ b/drivers/accel/amdxdna/aie2_msg_priv.h
>> @@ -31,6 +31,7 @@ enum aie2_msg_opcode {
>> MSG_OP_SET_RUNTIME_CONFIG = 0x10A,
>> MSG_OP_GET_RUNTIME_CONFIG = 0x10B,
>> MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C,
>> + MSG_OP_GET_APP_HEALTH = 0x114,
>> MSG_OP_MAX_DRV_OPCODE,
>> MSG_OP_GET_PROTOCOL_VERSION = 0x301,
>> MSG_OP_MAX_OPCODE
>> @@ -451,4 +452,55 @@ struct config_debug_bo_req {
>> struct config_debug_bo_resp {
>> enum aie2_msg_status status;
>> } __packed;
>> +
>> +struct fatal_error_info {
>> + __u32 fatal_type; /* Fatal error type */
>> + __u32 exception_type; /* Only valid if fatal_type is a
>> specific value */
>> + __u32 exception_argument; /* Argument based on exception type */
>> + __u32 exception_pc; /* Program Counter at the time of the
>> exception */
>> + __u32 app_module; /* Error module name */
>> + __u32 task_index; /* Index of the task in which the
>> error occurred */
>> + __u32 reserved[128];
>> +};
>> +
>> +struct app_health_report {
>> + __u16 major;
>> + __u16 minor;
>> + __u32 size;
>> + __u32 context_id;
>> + /*
>> + * Program Counter (PC) of the last initiated DPU opcode, as
>> reported by the ERT
>> + * application. Before execution begins or after successful
>> completion, the value is set
>> + * to UINT_MAX. If execution halts prematurely due to an error,
>> this field retains the
>> + * opcode's PC value.
>> + * Note: To optimize performance, the ERT may simplify certain
>> aspects of reporting.
>> + * Proper interpretation requires familiarity with the
>> implementation details.
>> + */
>> + __u32 dpu_pc;
>> + /*
>> + * Index of the last initiated TXN opcode.
>> + * Before execution starts or after successful completion, the
>> value is set to UINT_MAX.
>> + * If execution halts prematurely due to an error, this field
>> retains the opcode's ID.
>> + * Note: To optimize performance, the ERT may simplify certain
>> aspects of reporting.
>> + * Proper interpretation requires familiarity with the
>> implementation details.
>> + */
>> + __u32 txn_op_id;
>> + /* The PC of the context at the time of the report */
>> + __u32 ctx_pc;
>> + struct fatal_error_info fatal_info;
>> + /* Index of the most recently executed run list entry. */
>> + __u32 run_list_id;
>> +};
>> +
>> +struct get_app_health_req {
>> + __u32 context_id;
>> + __u32 buf_size;
>> + __u64 buf_addr;
>> +} __packed;
>> +
>> +struct get_app_health_resp {
>> + enum aie2_msg_status status;
>> + __u32 required_buffer_size;
>> + __u32 reserved[7];
>> +} __packed;
>> #endif /* _AIE2_MSG_PRIV_H_ */
>> diff --git a/drivers/accel/amdxdna/aie2_pci.c
>> b/drivers/accel/amdxdna/aie2_pci.c
>> index ddd3d82f3426..9e39bfe75971 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.c
>> +++ b/drivers/accel/amdxdna/aie2_pci.c
>> @@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct
>> amdxdna_hwctx *hwctx, void *arg)
>> struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
>> struct amdxdna_drm_get_array *array_args = arg;
>> struct amdxdna_drm_hwctx_entry __user *buf;
>> + struct app_health_report report;
>> + struct amdxdna_dev_hdl *ndev;
>> u32 size;
>> + int ret;
>> if (!array_args->num_element)
>> return -EINVAL;
>> @@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct
>> amdxdna_hwctx *hwctx, void *arg)
>> tmp->latency = hwctx->qos.latency;
>> tmp->frame_exec_time = hwctx->qos.frame_exec_time;
>> tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
>> + ndev = hwctx->client->xdna->dev_handle;
>> + ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
>> + if (!ret) {
>> + /* Fill in app health report fields */
>> + tmp->txn_op_idx = report.txn_op_id;
>> + tmp->ctx_pc = report.ctx_pc;
>> + tmp->fatal_error_type = report.fatal_info.fatal_type;
>> + tmp->fatal_error_exception_type =
>> report.fatal_info.exception_type;
>> + tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
>> + tmp->fatal_error_app_module = report.fatal_info.app_module;
>> + }
>> buf = u64_to_user_ptr(array_args->buffer);
>> size = min(sizeof(*tmp), array_args->element_size);
>> diff --git a/drivers/accel/amdxdna/aie2_pci.h
>> b/drivers/accel/amdxdna/aie2_pci.h
>> index 885ae7e6bfc7..efcf4be035f0 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.h
>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>> @@ -10,6 +10,7 @@
>> #include <linux/limits.h>
>> #include <linux/semaphore.h>
>> +#include "aie2_msg_priv.h"
>> #include "amdxdna_mailbox.h"
>> #define AIE2_INTERVAL 20000 /* us */
>> @@ -261,6 +262,7 @@ enum aie2_fw_feature {
>> AIE2_NPU_COMMAND,
>> AIE2_PREEMPT,
>> AIE2_TEMPORAL_ONLY,
>> + AIE2_APP_HEALTH,
>> AIE2_FEATURE_MAX
>> };
>> @@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
>> u32 min_minor;
>> };
>> +#define AIE2_ALL_FEATURES GENMASK_ULL(AIE2_FEATURE_MAX - 1,
>> AIE2_NPU_COMMAND)
>> #define AIE2_FEATURE_ON(ndev, feature) test_bit(feature,
>> &(ndev)->feature_mask)
>> struct amdxdna_dev_priv {
>> @@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl
>> *ndev, struct aie_version *ver
>> int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct
>> aie_metadata *metadata);
>> int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
>> struct amdxdna_fw_ver *fw_ver);
>> +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
>> + struct app_health_report *report);
>> int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct
>> amdxdna_hwctx *hwctx);
>> int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct
>> amdxdna_hwctx *hwctx);
>> int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id,
>> u64 addr, u64 size);
>> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c
>> b/drivers/accel/amdxdna/amdxdna_ctx.c
>> index 666dfd7b2a80..4b921715176d 100644
>> --- a/drivers/accel/amdxdna/amdxdna_ctx.c
>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
>> @@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj
>> *abo)
>> int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
>> struct amdxdna_sched_job *job, u32 cmd_idx,
>> - enum ert_cmd_state error_state)
>> + enum ert_cmd_state error_state,
>> + void *err_data, size_t size)
>> {
>> struct amdxdna_client *client = job->hwctx->client;
>> struct amdxdna_cmd *cmd = abo->mem.kva;
>> @@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj
>> *abo,
>> }
>> memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
>> + if (err_data)
>> + memcpy(cmd->data, err_data, min(size, abo->mem.size -
>> sizeof(*cmd)));
>> +
>> if (cc)
>> amdxdna_gem_put_obj(abo);
>> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h
>> b/drivers/accel/amdxdna/amdxdna_ctx.h
>> index fbdf9d000871..57db1527a93b 100644
>> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
>> @@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
>> u32 prop_args[]; /* properties and regular kernel arguments */
>> };
>> +#define AMDXDNA_CMD_CTX_HEALTH_V1 1
>> +#define AMDXDNA_CMD_CTX_HEALTH_AIE2 0
>> +struct amdxdna_ctx_health {
>> + u32 version;
>> + u32 npu_gen;
>> +};
>> +
>> /* Exec buffer command header format */
>> #define AMDXDNA_CMD_STATE GENMASK(3, 0)
>> #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
>> @@ -122,6 +129,11 @@ struct amdxdna_drv_cmd {
>> u32 result;
>> };
>> +struct app_health_report;
>> +union amdxdna_job_priv {
>> + struct app_health_report *aie2_health;
>> +};
>> +
>> struct amdxdna_sched_job {
>> struct drm_sched_job base;
>> struct kref refcnt;
>> @@ -136,10 +148,13 @@ struct amdxdna_sched_job {
>> u64 seq;
>> struct amdxdna_drv_cmd *drv_cmd;
>> struct amdxdna_gem_obj *cmd_bo;
>> + union amdxdna_job_priv priv;
>> size_t bo_cnt;
>> struct drm_gem_object *bos[] __counted_by(bo_cnt);
>> };
>> +#define aie2_job_health priv.aie2_health
>> +
>> static inline u32
>> amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
>> {
>> @@ -169,7 +184,8 @@ void *amdxdna_cmd_get_payload(struct
>> amdxdna_gem_obj *abo, u32 *size);
>> u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
>> int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
>> struct amdxdna_sched_job *job, u32 cmd_idx,
>> - enum ert_cmd_state error_state);
>> + enum ert_cmd_state error_state,
>> + void *err_data, size_t size);
>> void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
>> void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
>> diff --git a/drivers/accel/amdxdna/npu4_regs.c
>> b/drivers/accel/amdxdna/npu4_regs.c
>> index ce25eef5fc34..619bff042e52 100644
>> --- a/drivers/accel/amdxdna/npu4_regs.c
>> +++ b/drivers/accel/amdxdna/npu4_regs.c
>> @@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl
>> npu4_fw_feature_table[] = {
>> { .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor
>> = 15 },
>> { .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor =
>> 12 },
>> { .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6,
>> .min_minor = 12 },
>> - { .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND),
>> .major = 7 },
>> + { .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor =
>> 18 },
>> + { .features = AIE2_ALL_FEATURES, .major = 7 },
>> { 0 }
>> };
>
© 2016 - 2026 Red Hat, Inc.