[PATCH V2] accel/amdxdna: Support retrieving hardware context debug information

Lizhi Hou posted 1 patch 2 weeks, 6 days ago
drivers/accel/amdxdna/aie2_ctx.c      | 85 ++++++++++++++++++++++++---
drivers/accel/amdxdna/aie2_message.c  | 41 +++++++++++++
drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
drivers/accel/amdxdna/aie2_pci.c      | 14 +++++
drivers/accel/amdxdna/aie2_pci.h      |  5 ++
drivers/accel/amdxdna/amdxdna_ctx.c   |  6 +-
drivers/accel/amdxdna/amdxdna_ctx.h   | 18 +++++-
drivers/accel/amdxdna/npu4_regs.c     |  3 +-
8 files changed, 213 insertions(+), 11 deletions(-)
[PATCH V2] accel/amdxdna: Support retrieving hardware context debug information
Posted by Lizhi Hou 2 weeks, 6 days ago
The firmware implements the GET_APP_HEALTH command to collect debug
information for a specific hardware context.

When a command times out, the driver issues this command to collect the
relevant debug information. User space tools can also retrieve this
information through the hardware context query IOCTL.

Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
 drivers/accel/amdxdna/aie2_ctx.c      | 85 ++++++++++++++++++++++++---
 drivers/accel/amdxdna/aie2_message.c  | 41 +++++++++++++
 drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
 drivers/accel/amdxdna/aie2_pci.c      | 14 +++++
 drivers/accel/amdxdna/aie2_pci.h      |  5 ++
 drivers/accel/amdxdna/amdxdna_ctx.c   |  6 +-
 drivers/accel/amdxdna/amdxdna_ctx.h   | 18 +++++-
 drivers/accel/amdxdna/npu4_regs.c     |  3 +-
 8 files changed, 213 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 779ac70d62d7..6292349868c5 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
 
 #define HWCTX_MAX_TIMEOUT	60000 /* milliseconds */
 
+struct aie2_ctx_health {
+	struct amdxdna_ctx_health header; /* version and npu_gen tags */
+	u32 txn_op_idx; /* copied from app_health_report.txn_op_id */
+	u32 ctx_pc; /* copied from app_health_report.ctx_pc */
+	u32 fatal_error_type; /* copied from fatal_info.fatal_type */
+	u32 fatal_error_exception_type; /* copied from fatal_info.exception_type */
+	u32 fatal_error_exception_pc; /* copied from fatal_info.exception_pc */
+	u32 fatal_error_app_module; /* copied from fatal_info.app_module */
+};
+
 static void aie2_job_release(struct kref *ref)
 {
 	struct amdxdna_sched_job *job;
@@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
 	wake_up(&job->hwctx->priv->job_free_wq);
 	if (job->out_fence)
 		dma_fence_put(job->out_fence);
+	kfree(job->aie2_job_health);
 	kfree(job);
 }
 
@@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
 	aie2_job_put(job);
 }
 
+static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
+{
+	struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
+	struct amdxdna_dev *xdna = job->hwctx->client->xdna;
+	struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+	struct app_health_report *report = job->aie2_job_health;
+	u32 fail_cmd_idx = 0;
+
+	if (!report)
+		goto set_timeout;
+	/* Dump the health report attached to the job by aie2_sched_job_timedout(). */
+	XDNA_ERR(xdna, "Firmware timeout state capture:");
+	XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
+	XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
+	XDNA_ERR(xdna, "\tContext ID: %u", report->context_id);
+	XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
+	XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
+	XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
+	XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type);
+	XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type);
+	XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc);
+	XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module);
+	XDNA_ERR(xdna, "\tFatal error task ID: %u", report->fatal_info.task_index);
+	XDNA_ERR(xdna, "\tTimed out sub command ID: %u", report->run_list_id);
+	/* Use the report's run list entry as the failing sub command index. */
+	fail_cmd_idx = report->run_list_id;
+	aie2_health = kzalloc_obj(*aie2_health);
+	if (!aie2_health)
+		goto set_timeout;
+
+	aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
+	aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
+	aie2_health->txn_op_idx = report->txn_op_id;
+	aie2_health->ctx_pc = report->ctx_pc;
+	aie2_health->fatal_error_type = report->fatal_info.fatal_type;
+	aie2_health->fatal_error_exception_type = report->fatal_info.exception_type;
+	aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc;
+	aie2_health->fatal_error_app_module = report->fatal_info.app_module;
+	/* Mark the command timed out; a NULL aie2_health means no payload is copied. */
+set_timeout:
+	amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
+			      aie2_health, sizeof(*aie2_health));
+}
+
 static int
 aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
 {
@@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
 	cmd_abo = job->cmd_bo;
 
 	if (unlikely(job->job_timeout)) {
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+		aie2_set_cmd_timeout(job);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (unlikely(!data) || unlikely(size != sizeof(u32))) {
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
 	if (status == AIE2_STATUS_SUCCESS)
 		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
 	else
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0);
 
 out:
 	aie2_sched_notify(job);
@@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
 	struct amdxdna_sched_job *job = handle;
 	struct amdxdna_gem_obj *cmd_abo;
 	struct amdxdna_dev *xdna;
+	u32 fail_cmd_idx = 0;
 	u32 fail_cmd_status;
-	u32 fail_cmd_idx;
 	u32 cmd_status;
 	int ret = 0;
 
 	cmd_abo = job->cmd_bo;
 
 	if (unlikely(job->job_timeout)) {
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+		aie2_set_cmd_timeout(job);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
 		 fail_cmd_idx, fail_cmd_status);
 
 	if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
-		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0);
 		ret = -EINVAL;
 	} else {
-		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
+		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0);
 	}
 
 out:
@@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
 {
 	struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
 	struct amdxdna_hwctx *hwctx = job->hwctx;
+	struct app_health_report *report;
 	struct amdxdna_dev *xdna;
+	int ret;
 
 	xdna = hwctx->client->xdna;
 	trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
 	job->job_timeout = true;
+
 	mutex_lock(&xdna->dev_lock);
+	report = kzalloc_obj(*report);
+	if (!report)
+		goto reset_hwctx;
+
+	ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
+	if (ret)
+		kfree(report);
+	else
+		job->aie2_job_health = report;
+
+reset_hwctx:
 	aie2_hwctx_stop(xdna, hwctx, sched_job);
 
 	aie2_hwctx_restart(xdna, hwctx);
diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index 798128b6b7b7..4ec591306854 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *
 
 	return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
 }
+
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+			  struct app_health_report *report)
+{
+	DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
+	struct amdxdna_dev *xdna = ndev->xdna;
+	struct app_health_report *buf;
+	dma_addr_t dma_addr;
+	u32 buf_size;
+	int ret;
+
+	if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
+		XDNA_DBG(xdna, "App health feature not supported");
+		return -EOPNOTSUPP;
+	}
+	/* Staging buffer for the report; the allocator may adjust buf_size (TODO confirm). */
+	buf_size = sizeof(*report);
+	buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
+	if (IS_ERR(buf)) {
+		XDNA_ERR(xdna, "Failed to allocate buffer for app health");
+		return PTR_ERR(buf);
+	}
+
+	req.buf_addr = dma_addr;
+	req.context_id = context_id;
+	req.buf_size = buf_size;
+	/* Flush the buffer's CPU cache lines before the request is sent. */
+	drm_clflush_virt_range(buf, sizeof(*report));
+	ret = aie2_send_mgmt_msg_wait(ndev, &msg);
+	if (ret) {
+		XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
+		goto free_buf;
+	}
+
+	/* Success: copy the report to caller's buffer; on failure *report is left untouched. */
+	memcpy(report, buf, sizeof(*report));
+
+free_buf:
+	aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
+	return ret;
+}
diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
index 728ef56f7f0a..f18e89a39e35 100644
--- a/drivers/accel/amdxdna/aie2_msg_priv.h
+++ b/drivers/accel/amdxdna/aie2_msg_priv.h
@@ -31,6 +31,7 @@ enum aie2_msg_opcode {
 	MSG_OP_SET_RUNTIME_CONFIG          = 0x10A,
 	MSG_OP_GET_RUNTIME_CONFIG          = 0x10B,
 	MSG_OP_REGISTER_ASYNC_EVENT_MSG    = 0x10C,
+	MSG_OP_GET_APP_HEALTH              = 0x114,
 	MSG_OP_MAX_DRV_OPCODE,
 	MSG_OP_GET_PROTOCOL_VERSION        = 0x301,
 	MSG_OP_MAX_OPCODE
@@ -451,4 +452,55 @@ struct config_debug_bo_req {
 struct config_debug_bo_resp {
 	enum aie2_msg_status	status;
 } __packed;
+
+struct fatal_error_info {
+	__u32 fatal_type;         /* Fatal error type */
+	__u32 exception_type;     /* Only valid if fatal_type is a specific value */
+	__u32 exception_argument; /* Argument based on exception type */
+	__u32 exception_pc;       /* Program Counter at the time of the exception */
+	__u32 app_module;         /* Error module name */
+	__u32 task_index;         /* Index of the task in which the error occurred */
+	__u32 reserved[128];      /* Reserved; not read by the driver code in this patch */
+};
+
+struct app_health_report {
+	__u16 major; /* logged on timeout as the major part of "Version" */
+	__u16 minor; /* logged on timeout as the minor part of "Version" */
+	__u32 size; /* logged on timeout as "Report size" */
+	__u32 context_id; /* logged on timeout as "Context ID" */
+	/*
+	 * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
+	 * application. Before execution begins or after successful completion, the value is set
+	 * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
+	 * opcode's PC value.
+	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+	 * Proper interpretation requires familiarity with the implementation details.
+	 */
+	__u32 dpu_pc;
+	/*
+	 * Index of the last initiated TXN opcode.
+	 * Before execution starts or after successful completion, the value is set to UINT_MAX.
+	 * If execution halts prematurely due to an error, this field retains the opcode's ID.
+	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+	 * Proper interpretation requires familiarity with the implementation details.
+	 */
+	__u32 txn_op_id;
+	/* The PC of the context at the time of the report */
+	__u32 ctx_pc;
+	struct fatal_error_info		fatal_info;
+	/* Index of the most recently executed run list entry. */
+	__u32 run_list_id;
+};
+
+struct get_app_health_req {
+	__u32 context_id; /* context to query; callers pass hwctx->fw_ctx_id */
+	__u32 buf_size; /* size of the report buffer at buf_addr */
+	__u64 buf_addr; /* DMA address of the report buffer */
+} __packed;
+
+struct get_app_health_resp {
+	enum aie2_msg_status status; /* logged when the request fails */
+	__u32 required_buffer_size; /* not read by the driver here; meaning TODO confirm */
+	__u32 reserved[7];
+} __packed;
 #endif /* _AIE2_MSG_PRIV_H_ */
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index ddd3d82f3426..9e39bfe75971 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
 	struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
 	struct amdxdna_drm_get_array *array_args = arg;
 	struct amdxdna_drm_hwctx_entry __user *buf;
+	struct app_health_report report;
+	struct amdxdna_dev_hdl *ndev;
 	u32 size;
+	int ret;
 
 	if (!array_args->num_element)
 		return -EINVAL;
@@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
 	tmp->latency = hwctx->qos.latency;
 	tmp->frame_exec_time = hwctx->qos.frame_exec_time;
 	tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
+	ndev = hwctx->client->xdna->dev_handle;
+	ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
+	if (!ret) {
+		/* Fill in app health report fields */
+		tmp->txn_op_idx = report.txn_op_id;
+		tmp->ctx_pc = report.ctx_pc;
+		tmp->fatal_error_type = report.fatal_info.fatal_type;
+		tmp->fatal_error_exception_type = report.fatal_info.exception_type;
+		tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
+		tmp->fatal_error_app_module = report.fatal_info.app_module;
+	}
 
 	buf = u64_to_user_ptr(array_args->buffer);
 	size = min(sizeof(*tmp), array_args->element_size);
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 885ae7e6bfc7..efcf4be035f0 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -10,6 +10,7 @@
 #include <linux/limits.h>
 #include <linux/semaphore.h>
 
+#include "aie2_msg_priv.h"
 #include "amdxdna_mailbox.h"
 
 #define AIE2_INTERVAL	20000	/* us */
@@ -261,6 +262,7 @@ enum aie2_fw_feature {
 	AIE2_NPU_COMMAND,
 	AIE2_PREEMPT,
 	AIE2_TEMPORAL_ONLY,
+	AIE2_APP_HEALTH,
 	AIE2_FEATURE_MAX
 };
 
@@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
 	u32 min_minor;
 };
 
+#define AIE2_ALL_FEATURES	GENMASK_ULL(AIE2_FEATURE_MAX - 1, AIE2_NPU_COMMAND)
 #define AIE2_FEATURE_ON(ndev, feature)	test_bit(feature, &(ndev)->feature_mask)
 
 struct amdxdna_dev_priv {
@@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *ver
 int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata);
 int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
 				struct amdxdna_fw_ver *fw_ver);
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+			  struct app_health_report *report);
 int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
 int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
 int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
index 666dfd7b2a80..4b921715176d 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
 
 int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
 			  struct amdxdna_sched_job *job, u32 cmd_idx,
-			  enum ert_cmd_state error_state)
+			  enum ert_cmd_state error_state,
+			  void *err_data, size_t size)
 {
 	struct amdxdna_client *client = job->hwctx->client;
 	struct amdxdna_cmd *cmd = abo->mem.kva;
@@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
 	}
 
 	memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
+	if (err_data)
+		memcpy(cmd->data, err_data, min(size, abo->mem.size - sizeof(*cmd)));
+
 	if (cc)
 		amdxdna_gem_put_obj(abo);
 
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index fbdf9d000871..57db1527a93b 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
 	u32 prop_args[];    /* properties and regular kernel arguments */
 };
 
+#define AMDXDNA_CMD_CTX_HEALTH_V1	1 /* value stored in amdxdna_ctx_health.version */
+#define AMDXDNA_CMD_CTX_HEALTH_AIE2	0 /* value stored in amdxdna_ctx_health.npu_gen */
+struct amdxdna_ctx_health {
+	u32 version; /* AMDXDNA_CMD_CTX_HEALTH_V1 on the AIE2 timeout path */
+	u32 npu_gen; /* AMDXDNA_CMD_CTX_HEALTH_AIE2 on the AIE2 timeout path */
+};
+
 /* Exec buffer command header format */
 #define AMDXDNA_CMD_STATE		GENMASK(3, 0)
 #define AMDXDNA_CMD_EXTRA_CU_MASK	GENMASK(11, 10)
@@ -122,6 +129,11 @@ struct amdxdna_drv_cmd {
 	u32			result;
 };
 
+struct app_health_report;
+union amdxdna_job_priv {
+	struct app_health_report *aie2_health; /* set on job timeout, kfree()d on job release */
+};
+
 struct amdxdna_sched_job {
 	struct drm_sched_job	base;
 	struct kref		refcnt;
@@ -136,10 +148,13 @@ struct amdxdna_sched_job {
 	u64			seq;
 	struct amdxdna_drv_cmd	*drv_cmd;
 	struct amdxdna_gem_obj	*cmd_bo;
+	union amdxdna_job_priv	priv;
 	size_t			bo_cnt;
 	struct drm_gem_object	*bos[] __counted_by(bo_cnt);
 };
 
+#define aie2_job_health priv.aie2_health
+
 static inline u32
 amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
 {
@@ -169,7 +184,8 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
 u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
 int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
 			  struct amdxdna_sched_job *job, u32 cmd_idx,
-			  enum ert_cmd_state error_state);
+			  enum ert_cmd_state error_state,
+			  void *err_data, size_t size);
 
 void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
 void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
index ce25eef5fc34..619bff042e52 100644
--- a/drivers/accel/amdxdna/npu4_regs.c
+++ b/drivers/accel/amdxdna/npu4_regs.c
@@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = {
 	{ .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
 	{ .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 },
 	{ .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 },
-	{ .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major = 7 },
+	{ .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
+	{ .features = AIE2_ALL_FEATURES, .major = 7 },
 	{ 0 }
 };
 
-- 
2.34.1
Re: [PATCH V2] accel/amdxdna: Support retrieving hardware context debug information
Posted by Mario Limonciello 2 weeks, 6 days ago

On 3/16/26 23:49, Lizhi Hou wrote:
> The firmware implements the GET_APP_HEALTH command to collect debug
> information for a specific hardware context.
> 
> When a command times out, the driver issues this command to collect the
> relevant debug information. User space tools can also retrieve this
> information through the hardware context query IOCTL.
> 
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
>   drivers/accel/amdxdna/aie2_ctx.c      | 85 ++++++++++++++++++++++++---
>   drivers/accel/amdxdna/aie2_message.c  | 41 +++++++++++++
>   drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
>   drivers/accel/amdxdna/aie2_pci.c      | 14 +++++
>   drivers/accel/amdxdna/aie2_pci.h      |  5 ++
>   drivers/accel/amdxdna/amdxdna_ctx.c   |  6 +-
>   drivers/accel/amdxdna/amdxdna_ctx.h   | 18 +++++-
>   drivers/accel/amdxdna/npu4_regs.c     |  3 +-
>   8 files changed, 213 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> index 779ac70d62d7..6292349868c5 100644
> --- a/drivers/accel/amdxdna/aie2_ctx.c
> +++ b/drivers/accel/amdxdna/aie2_ctx.c
> @@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
>   
>   #define HWCTX_MAX_TIMEOUT	60000 /* milliseconds */
>   
> +struct aie2_ctx_health {
> +	struct amdxdna_ctx_health header;
> +	u32 txn_op_idx;
> +	u32 ctx_pc;
> +	u32 fatal_error_type;
> +	u32 fatal_error_exception_type;
> +	u32 fatal_error_exception_pc;
> +	u32 fatal_error_app_module;
> +};
> +
>   static void aie2_job_release(struct kref *ref)
>   {
>   	struct amdxdna_sched_job *job;
> @@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
>   	wake_up(&job->hwctx->priv->job_free_wq);
>   	if (job->out_fence)
>   		dma_fence_put(job->out_fence);
> +	kfree(job->aie2_job_health);
>   	kfree(job);
>   }
>   
> @@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
>   	aie2_job_put(job);
>   }
>   
> +static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
> +{
> +	struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
> +	struct amdxdna_dev *xdna = job->hwctx->client->xdna;
> +	struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
> +	struct app_health_report *report = job->aie2_job_health;
> +	u32 fail_cmd_idx = 0;
> +
> +	if (!report)
> +		goto set_timeout;
> +
> +	XDNA_ERR(xdna, "Firmware timeout state capture:");
> +	XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
> +	XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
> +	XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
> +	XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
> +	XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
> +	XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
> +	XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type);
> +	XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type);
> +	XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc);
> +	XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module);
> +	XDNA_ERR(xdna, "\tFatal error task ID: %d", report->fatal_info.task_index);
> +	XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id);
> +
> +	fail_cmd_idx = report->run_list_id;
> +	aie2_health = kzalloc_obj(*aie2_health);
> +	if (!aie2_health)
> +		goto set_timeout;
> +
> +	aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
> +	aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
> +	aie2_health->txn_op_idx = report->txn_op_id;
> +	aie2_health->ctx_pc = report->ctx_pc;
> +	aie2_health->fatal_error_type = report->fatal_info.fatal_type;
> +	aie2_health->fatal_error_exception_type = report->fatal_info.exception_type;
> +	aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc;
> +	aie2_health->fatal_error_app_module = report->fatal_info.app_module;
> +
> +set_timeout:
> +	amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
> +			      aie2_health, sizeof(*aie2_health));
> +}
> +
>   static int
>   aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
>   {
> @@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
>   	cmd_abo = job->cmd_bo;
>   
>   	if (unlikely(job->job_timeout)) {
> -		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
> +		aie2_set_cmd_timeout(job);
>   		ret = -EINVAL;
>   		goto out;
>   	}
>   
>   	if (unlikely(!data) || unlikely(size != sizeof(u32))) {
> -		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
> +		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
>   		ret = -EINVAL;
>   		goto out;
>   	}
> @@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
>   	if (status == AIE2_STATUS_SUCCESS)
>   		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
>   	else
> -		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
> +		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0);
>   
>   out:
>   	aie2_sched_notify(job);
> @@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
>   	struct amdxdna_sched_job *job = handle;
>   	struct amdxdna_gem_obj *cmd_abo;
>   	struct amdxdna_dev *xdna;
> +	u32 fail_cmd_idx = 0;
>   	u32 fail_cmd_status;
> -	u32 fail_cmd_idx;
>   	u32 cmd_status;
>   	int ret = 0;
>   
>   	cmd_abo = job->cmd_bo;
>   
>   	if (unlikely(job->job_timeout)) {
> -		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
> +		aie2_set_cmd_timeout(job);
>   		ret = -EINVAL;
>   		goto out;
>   	}
>   
>   	if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
> -		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
> +		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
>   		ret = -EINVAL;
>   		goto out;
>   	}
> @@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
>   		 fail_cmd_idx, fail_cmd_status);
>   
>   	if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
> -		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
> +		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0);
>   		ret = -EINVAL;
>   	} else {
> -		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
> +		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0);
>   	}
>   
>   out:
> @@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
>   {
>   	struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
>   	struct amdxdna_hwctx *hwctx = job->hwctx;
> +	struct app_health_report *report;
>   	struct amdxdna_dev *xdna;
> +	int ret;
>   
>   	xdna = hwctx->client->xdna;
>   	trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
>   	job->job_timeout = true;
> +
>   	mutex_lock(&xdna->dev_lock);
> +	report = kzalloc_obj(*report);
> +	if (!report)
> +		goto reset_hwctx;
> +
> +	ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
> +	if (ret)
> +		kfree(report);
> +	else
> +		job->aie2_job_health = report;
> +
> +reset_hwctx:
>   	aie2_hwctx_stop(xdna, hwctx, sched_job);
>   
>   	aie2_hwctx_restart(xdna, hwctx);
> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
> index 798128b6b7b7..4ec591306854 100644
> --- a/drivers/accel/amdxdna/aie2_message.c
> +++ b/drivers/accel/amdxdna/aie2_message.c
> @@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *
>   
>   	return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
>   }
> +
> +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
> +			  struct app_health_report *report)
> +{
> +	DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
> +	struct amdxdna_dev *xdna = ndev->xdna;
> +	struct app_health_report *buf;
> +	dma_addr_t dma_addr;
> +	u32 buf_size;
> +	int ret;
> +
> +	if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
> +		XDNA_DBG(xdna, "App health feature not supported");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	buf_size = sizeof(*report);
> +	buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
> +	if (IS_ERR(buf)) {
> +		XDNA_ERR(xdna, "Failed to allocate buffer for app health");
> +		return PTR_ERR(buf);
> +	}
> +
> +	req.buf_addr = dma_addr;
> +	req.context_id = context_id;
> +	req.buf_size = buf_size;
> +
> +	drm_clflush_virt_range(buf, sizeof(*report));
> +	ret = aie2_send_mgmt_msg_wait(ndev, &msg);
> +	if (ret) {
> +		XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
> +		goto free_buf;
> +	}
> +
> +	/* Copy the report to caller's buffer */
> +	memcpy(report, buf, sizeof(*report));
> +
> +free_buf:
> +	aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
> +	return ret;
> +}
> diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
> index 728ef56f7f0a..f18e89a39e35 100644
> --- a/drivers/accel/amdxdna/aie2_msg_priv.h
> +++ b/drivers/accel/amdxdna/aie2_msg_priv.h
> @@ -31,6 +31,7 @@ enum aie2_msg_opcode {
>   	MSG_OP_SET_RUNTIME_CONFIG          = 0x10A,
>   	MSG_OP_GET_RUNTIME_CONFIG          = 0x10B,
>   	MSG_OP_REGISTER_ASYNC_EVENT_MSG    = 0x10C,
> +	MSG_OP_GET_APP_HEALTH              = 0x114,
>   	MSG_OP_MAX_DRV_OPCODE,
>   	MSG_OP_GET_PROTOCOL_VERSION        = 0x301,
>   	MSG_OP_MAX_OPCODE
> @@ -451,4 +452,55 @@ struct config_debug_bo_req {
>   struct config_debug_bo_resp {
>   	enum aie2_msg_status	status;
>   } __packed;
> +
> +struct fatal_error_info {
> +	__u32 fatal_type;         /* Fatal error type */
> +	__u32 exception_type;     /* Only valid if fatal_type is a specific value */
> +	__u32 exception_argument; /* Argument based on exception type */
> +	__u32 exception_pc;       /* Program Counter at the time of the exception */
> +	__u32 app_module;         /* Error module name */
> +	__u32 task_index;         /* Index of the task in which the error occurred */
> +	__u32 reserved[128];
> +};
> +
> +struct app_health_report {
> +	__u16 major;
> +	__u16 minor;
> +	__u32 size;
> +	__u32 context_id;
> +	/*
> +	 * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
> +	 * application. Before execution begins or after successful completion, the value is set
> +	 * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
> +	 * opcode's PC value.
> +	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
> +	 * Proper interpretation requires familiarity with the implementation details.
> +	 */
> +	__u32 dpu_pc;
> +	/*
> +	 * Index of the last initiated TXN opcode.
> +	 * Before execution starts or after successful completion, the value is set to UINT_MAX.
> +	 * If execution halts prematurely due to an error, this field retains the opcode's ID.
> +	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
> +	 * Proper interpretation requires familiarity with the implementation details.
> +	 */
> +	__u32 txn_op_id;
> +	/* The PC of the context at the time of the report */
> +	__u32 ctx_pc;
> +	struct fatal_error_info		fatal_info;
> +	/* Index of the most recently executed run list entry. */
> +	__u32 run_list_id;
> +};
> +
> +struct get_app_health_req {
> +	__u32 context_id;
> +	__u32 buf_size;
> +	__u64 buf_addr;
> +} __packed;
> +
> +struct get_app_health_resp {
> +	enum aie2_msg_status status;
> +	__u32 required_buffer_size;
> +	__u32 reserved[7];
> +} __packed;
>   #endif /* _AIE2_MSG_PRIV_H_ */
> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
> index ddd3d82f3426..9e39bfe75971 100644
> --- a/drivers/accel/amdxdna/aie2_pci.c
> +++ b/drivers/accel/amdxdna/aie2_pci.c
> @@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
>   	struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
>   	struct amdxdna_drm_get_array *array_args = arg;
>   	struct amdxdna_drm_hwctx_entry __user *buf;
> +	struct app_health_report report;
> +	struct amdxdna_dev_hdl *ndev;
>   	u32 size;
> +	int ret;
>   
>   	if (!array_args->num_element)
>   		return -EINVAL;
> @@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
>   	tmp->latency = hwctx->qos.latency;
>   	tmp->frame_exec_time = hwctx->qos.frame_exec_time;
>   	tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
> +	ndev = hwctx->client->xdna->dev_handle;
> +	ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
> +	if (!ret) {
> +		/* Fill in app health report fields */
> +		tmp->txn_op_idx = report.txn_op_id;
> +		tmp->ctx_pc = report.ctx_pc;
> +		tmp->fatal_error_type = report.fatal_info.fatal_type;
> +		tmp->fatal_error_exception_type = report.fatal_info.exception_type;
> +		tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
> +		tmp->fatal_error_app_module = report.fatal_info.app_module;
> +	}
>   
>   	buf = u64_to_user_ptr(array_args->buffer);
>   	size = min(sizeof(*tmp), array_args->element_size);
> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> index 885ae7e6bfc7..efcf4be035f0 100644
> --- a/drivers/accel/amdxdna/aie2_pci.h
> +++ b/drivers/accel/amdxdna/aie2_pci.h
> @@ -10,6 +10,7 @@
>   #include <linux/limits.h>
>   #include <linux/semaphore.h>
>   
> +#include "aie2_msg_priv.h"
>   #include "amdxdna_mailbox.h"
>   
>   #define AIE2_INTERVAL	20000	/* us */
> @@ -261,6 +262,7 @@ enum aie2_fw_feature {
>   	AIE2_NPU_COMMAND,
>   	AIE2_PREEMPT,
>   	AIE2_TEMPORAL_ONLY,
> +	AIE2_APP_HEALTH,
>   	AIE2_FEATURE_MAX
>   };
>   
> @@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
>   	u32 min_minor;
>   };
>   
> +#define AIE2_ALL_FEATURES	GENMASK_ULL(AIE2_FEATURE_MAX - 1, AIE2_NPU_COMMAND)
>   #define AIE2_FEATURE_ON(ndev, feature)	test_bit(feature, &(ndev)->feature_mask)
>   
>   struct amdxdna_dev_priv {
> @@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *ver
>   int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata);
>   int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
>   				struct amdxdna_fw_ver *fw_ver);
> +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
> +			  struct app_health_report *report);
>   int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
>   int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
>   int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
> index 666dfd7b2a80..4b921715176d 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.c
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
> @@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
>   
>   int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
>   			  struct amdxdna_sched_job *job, u32 cmd_idx,
> -			  enum ert_cmd_state error_state)
> +			  enum ert_cmd_state error_state,
> +			  void *err_data, size_t size)
>   {
>   	struct amdxdna_client *client = job->hwctx->client;
>   	struct amdxdna_cmd *cmd = abo->mem.kva;
> @@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
>   	}
>   
>   	memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
> +	if (err_data)
> +		memcpy(cmd->data, err_data, min(size, abo->mem.size - sizeof(*cmd)));
> +
>   	if (cc)
>   		amdxdna_gem_put_obj(abo);
>   
> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
> index fbdf9d000871..57db1527a93b 100644
> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
> @@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
>   	u32 prop_args[];    /* properties and regular kernel arguments */
>   };
>   
> +#define AMDXDNA_CMD_CTX_HEALTH_V1	1
> +#define AMDXDNA_CMD_CTX_HEALTH_AIE2	0
> +struct amdxdna_ctx_health {
> +	u32 version;
> +	u32 npu_gen;
> +};
> +
>   /* Exec buffer command header format */
>   #define AMDXDNA_CMD_STATE		GENMASK(3, 0)
>   #define AMDXDNA_CMD_EXTRA_CU_MASK	GENMASK(11, 10)
> @@ -122,6 +129,11 @@ struct amdxdna_drv_cmd {
>   	u32			result;
>   };
>   
> +struct app_health_report;
> +union amdxdna_job_priv {
> +	struct app_health_report *aie2_health;
> +};
> +
>   struct amdxdna_sched_job {
>   	struct drm_sched_job	base;
>   	struct kref		refcnt;
> @@ -136,10 +148,13 @@ struct amdxdna_sched_job {
>   	u64			seq;
>   	struct amdxdna_drv_cmd	*drv_cmd;
>   	struct amdxdna_gem_obj	*cmd_bo;
> +	union amdxdna_job_priv	priv;
>   	size_t			bo_cnt;
>   	struct drm_gem_object	*bos[] __counted_by(bo_cnt);
>   };
>   
> +#define aie2_job_health priv.aie2_health
> +
>   static inline u32
>   amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
>   {
> @@ -169,7 +184,8 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
>   u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
>   int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
>   			  struct amdxdna_sched_job *job, u32 cmd_idx,
> -			  enum ert_cmd_state error_state);
> +			  enum ert_cmd_state error_state,
> +			  void *err_data, size_t size);
>   
>   void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
>   void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
> diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
> index ce25eef5fc34..619bff042e52 100644
> --- a/drivers/accel/amdxdna/npu4_regs.c
> +++ b/drivers/accel/amdxdna/npu4_regs.c
> @@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = {
>   	{ .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
>   	{ .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 },
>   	{ .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 },
> -	{ .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major = 7 },
> +	{ .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
> +	{ .features = AIE2_ALL_FEATURES, .major = 7 },
>   	{ 0 }
>   };
>
Re: [PATCH V2] accel/amdxdna: Support retrieving hardware context debug information
Posted by Lizhi Hou 2 weeks, 6 days ago
Applied to drm-misc-next.

On 3/17/26 12:25, Mario Limonciello wrote:
>
>
> On 3/16/26 23:49, Lizhi Hou wrote:
>> The firmware implements the GET_APP_HEALTH command to collect debug
>> information for a specific hardware context.
>>
>> When a command times out, the driver issues this command to collect the
>> relevant debug information. User space tools can also retrieve this
>> information through the hardware context query IOCTL.
>>
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
>> ---
>>   drivers/accel/amdxdna/aie2_ctx.c      | 85 ++++++++++++++++++++++++---
>>   drivers/accel/amdxdna/aie2_message.c  | 41 +++++++++++++
>>   drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
>>   drivers/accel/amdxdna/aie2_pci.c      | 14 +++++
>>   drivers/accel/amdxdna/aie2_pci.h      |  5 ++
>>   drivers/accel/amdxdna/amdxdna_ctx.c   |  6 +-
>>   drivers/accel/amdxdna/amdxdna_ctx.h   | 18 +++++-
>>   drivers/accel/amdxdna/npu4_regs.c     |  3 +-
>>   8 files changed, 213 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/aie2_ctx.c 
>> b/drivers/accel/amdxdna/aie2_ctx.c
>> index 779ac70d62d7..6292349868c5 100644
>> --- a/drivers/accel/amdxdna/aie2_ctx.c
>> +++ b/drivers/accel/amdxdna/aie2_ctx.c
>> @@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command 
>> list (Default true)");
>>     #define HWCTX_MAX_TIMEOUT    60000 /* milliseconds */
>>   +struct aie2_ctx_health {
>> +    struct amdxdna_ctx_health header;
>> +    u32 txn_op_idx;
>> +    u32 ctx_pc;
>> +    u32 fatal_error_type;
>> +    u32 fatal_error_exception_type;
>> +    u32 fatal_error_exception_pc;
>> +    u32 fatal_error_app_module;
>> +};
>> +
>>   static void aie2_job_release(struct kref *ref)
>>   {
>>       struct amdxdna_sched_job *job;
>> @@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
>>       wake_up(&job->hwctx->priv->job_free_wq);
>>       if (job->out_fence)
>>           dma_fence_put(job->out_fence);
>> +    kfree(job->aie2_job_health);
>>       kfree(job);
>>   }
>>   @@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
>>       aie2_job_put(job);
>>   }
>>   +static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
>> +{
>> +    struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
>> +    struct amdxdna_dev *xdna = job->hwctx->client->xdna;
>> +    struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
>> +    struct app_health_report *report = job->aie2_job_health;
>> +    u32 fail_cmd_idx = 0;
>> +
>> +    if (!report)
>> +        goto set_timeout;
>> +
>> +    XDNA_ERR(xdna, "Firmware timeout state capture:");
>> +    XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
>> +    XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
>> +    XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
>> +    XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
>> +    XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
>> +    XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
>> +    XDNA_ERR(xdna, "\tFatal error type: 0x%x", 
>> report->fatal_info.fatal_type);
>> +    XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", 
>> report->fatal_info.exception_type);
>> +    XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", 
>> report->fatal_info.exception_pc);
>> +    XDNA_ERR(xdna, "\tFatal error app module: 0x%x", 
>> report->fatal_info.app_module);
>> +    XDNA_ERR(xdna, "\tFatal error task ID: %d", 
>> report->fatal_info.task_index);
>> +    XDNA_ERR(xdna, "\tTimed out sub command ID: %d", 
>> report->run_list_id);
>> +
>> +    fail_cmd_idx = report->run_list_id;
>> +    aie2_health = kzalloc_obj(*aie2_health);
>> +    if (!aie2_health)
>> +        goto set_timeout;
>> +
>> +    aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
>> +    aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
>> +    aie2_health->txn_op_idx = report->txn_op_id;
>> +    aie2_health->ctx_pc = report->ctx_pc;
>> +    aie2_health->fatal_error_type = report->fatal_info.fatal_type;
>> +    aie2_health->fatal_error_exception_type = 
>> report->fatal_info.exception_type;
>> +    aie2_health->fatal_error_exception_pc = 
>> report->fatal_info.exception_pc;
>> +    aie2_health->fatal_error_app_module = 
>> report->fatal_info.app_module;
>> +
>> +set_timeout:
>> +    amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
>> ERT_CMD_STATE_TIMEOUT,
>> +                  aie2_health, sizeof(*aie2_health));
>> +}
>> +
>>   static int
>>   aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
>>   {
>> @@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void 
>> __iomem *data, size_t size)
>>       cmd_abo = job->cmd_bo;
>>         if (unlikely(job->job_timeout)) {
>> -        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
>> +        aie2_set_cmd_timeout(job);
>>           ret = -EINVAL;
>>           goto out;
>>       }
>>         if (unlikely(!data) || unlikely(size != sizeof(u32))) {
>> -        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
>> +        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, 
>> NULL, 0);
>>           ret = -EINVAL;
>>           goto out;
>>       }
>> @@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void 
>> __iomem *data, size_t size)
>>       if (status == AIE2_STATUS_SUCCESS)
>>           amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
>>       else
>> -        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
>> +        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, 
>> NULL, 0);
>>     out:
>>       aie2_sched_notify(job);
>> @@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, 
>> void __iomem *data, size_t size)
>>       struct amdxdna_sched_job *job = handle;
>>       struct amdxdna_gem_obj *cmd_abo;
>>       struct amdxdna_dev *xdna;
>> +    u32 fail_cmd_idx = 0;
>>       u32 fail_cmd_status;
>> -    u32 fail_cmd_idx;
>>       u32 cmd_status;
>>       int ret = 0;
>>         cmd_abo = job->cmd_bo;
>>         if (unlikely(job->job_timeout)) {
>> -        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
>> +        aie2_set_cmd_timeout(job);
>>           ret = -EINVAL;
>>           goto out;
>>       }
>>         if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
>> -        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
>> +        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, 
>> NULL, 0);
>>           ret = -EINVAL;
>>           goto out;
>>       }
>> @@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, 
>> void __iomem *data, size_t size)
>>            fail_cmd_idx, fail_cmd_status);
>>         if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
>> -        amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
>> ERT_CMD_STATE_ABORT);
>> +        amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
>> ERT_CMD_STATE_ABORT, NULL, 0);
>>           ret = -EINVAL;
>>       } else {
>> -        amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
>> ERT_CMD_STATE_ERROR);
>> +        amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
>> ERT_CMD_STATE_ERROR, NULL, 0);
>>       }
>>     out:
>> @@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job 
>> *sched_job)
>>   {
>>       struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
>>       struct amdxdna_hwctx *hwctx = job->hwctx;
>> +    struct app_health_report *report;
>>       struct amdxdna_dev *xdna;
>> +    int ret;
>>         xdna = hwctx->client->xdna;
>>       trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
>>       job->job_timeout = true;
>> +
>>       mutex_lock(&xdna->dev_lock);
>> +    report = kzalloc_obj(*report);
>> +    if (!report)
>> +        goto reset_hwctx;
>> +
>> +    ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, 
>> report);
>> +    if (ret)
>> +        kfree(report);
>> +    else
>> +        job->aie2_job_health = report;
>> +
>> +reset_hwctx:
>>       aie2_hwctx_stop(xdna, hwctx, sched_job);
>>         aie2_hwctx_restart(xdna, hwctx);
>> diff --git a/drivers/accel/amdxdna/aie2_message.c 
>> b/drivers/accel/amdxdna/aie2_message.c
>> index 798128b6b7b7..4ec591306854 100644
>> --- a/drivers/accel/amdxdna/aie2_message.c
>> +++ b/drivers/accel/amdxdna/aie2_message.c
>> @@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx 
>> *hwctx, struct amdxdna_sched_job *
>>         return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
>>   }
>> +
>> +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
>> +              struct app_health_report *report)
>> +{
>> +    DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
>> +    struct amdxdna_dev *xdna = ndev->xdna;
>> +    struct app_health_report *buf;
>> +    dma_addr_t dma_addr;
>> +    u32 buf_size;
>> +    int ret;
>> +
>> +    if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
>> +        XDNA_DBG(xdna, "App health feature not supported");
>> +        return -EOPNOTSUPP;
>> +    }
>> +
>> +    buf_size = sizeof(*report);
>> +    buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
>> +    if (IS_ERR(buf)) {
>> +        XDNA_ERR(xdna, "Failed to allocate buffer for app health");
>> +        return PTR_ERR(buf);
>> +    }
>> +
>> +    req.buf_addr = dma_addr;
>> +    req.context_id = context_id;
>> +    req.buf_size = buf_size;
>> +
>> +    drm_clflush_virt_range(buf, sizeof(*report));
>> +    ret = aie2_send_mgmt_msg_wait(ndev, &msg);
>> +    if (ret) {
>> +        XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", 
>> ret, resp.status);
>> +        goto free_buf;
>> +    }
>> +
>> +    /* Copy the report to caller's buffer */
>> +    memcpy(report, buf, sizeof(*report));
>> +
>> +free_buf:
>> +    aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
>> +    return ret;
>> +}
>> diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h 
>> b/drivers/accel/amdxdna/aie2_msg_priv.h
>> index 728ef56f7f0a..f18e89a39e35 100644
>> --- a/drivers/accel/amdxdna/aie2_msg_priv.h
>> +++ b/drivers/accel/amdxdna/aie2_msg_priv.h
>> @@ -31,6 +31,7 @@ enum aie2_msg_opcode {
>>       MSG_OP_SET_RUNTIME_CONFIG          = 0x10A,
>>       MSG_OP_GET_RUNTIME_CONFIG          = 0x10B,
>>       MSG_OP_REGISTER_ASYNC_EVENT_MSG    = 0x10C,
>> +    MSG_OP_GET_APP_HEALTH              = 0x114,
>>       MSG_OP_MAX_DRV_OPCODE,
>>       MSG_OP_GET_PROTOCOL_VERSION        = 0x301,
>>       MSG_OP_MAX_OPCODE
>> @@ -451,4 +452,55 @@ struct config_debug_bo_req {
>>   struct config_debug_bo_resp {
>>       enum aie2_msg_status    status;
>>   } __packed;
>> +
>> +struct fatal_error_info {
>> +    __u32 fatal_type;         /* Fatal error type */
>> +    __u32 exception_type;     /* Only valid if fatal_type is a 
>> specific value */
>> +    __u32 exception_argument; /* Argument based on exception type */
>> +    __u32 exception_pc;       /* Program Counter at the time of the 
>> exception */
>> +    __u32 app_module;         /* Error module name */
>> +    __u32 task_index;         /* Index of the task in which the 
>> error occurred */
>> +    __u32 reserved[128];
>> +};
>> +
>> +struct app_health_report {
>> +    __u16 major;
>> +    __u16 minor;
>> +    __u32 size;
>> +    __u32 context_id;
>> +    /*
>> +     * Program Counter (PC) of the last initiated DPU opcode, as 
>> reported by the ERT
>> +     * application. Before execution begins or after successful 
>> completion, the value is set
>> +     * to UINT_MAX. If execution halts prematurely due to an error, 
>> this field retains the
>> +     * opcode's PC value.
>> +     * Note: To optimize performance, the ERT may simplify certain 
>> aspects of reporting.
>> +     * Proper interpretation requires familiarity with the 
>> implementation details.
>> +     */
>> +    __u32 dpu_pc;
>> +    /*
>> +     * Index of the last initiated TXN opcode.
>> +     * Before execution starts or after successful completion, the 
>> value is set to UINT_MAX.
>> +     * If execution halts prematurely due to an error, this field 
>> retains the opcode's ID.
>> +     * Note: To optimize performance, the ERT may simplify certain 
>> aspects of reporting.
>> +     * Proper interpretation requires familiarity with the 
>> implementation details.
>> +     */
>> +    __u32 txn_op_id;
>> +    /* The PC of the context at the time of the report */
>> +    __u32 ctx_pc;
>> +    struct fatal_error_info        fatal_info;
>> +    /* Index of the most recently executed run list entry. */
>> +    __u32 run_list_id;
>> +};
>> +
>> +struct get_app_health_req {
>> +    __u32 context_id;
>> +    __u32 buf_size;
>> +    __u64 buf_addr;
>> +} __packed;
>> +
>> +struct get_app_health_resp {
>> +    enum aie2_msg_status status;
>> +    __u32 required_buffer_size;
>> +    __u32 reserved[7];
>> +} __packed;
>>   #endif /* _AIE2_MSG_PRIV_H_ */
>> diff --git a/drivers/accel/amdxdna/aie2_pci.c 
>> b/drivers/accel/amdxdna/aie2_pci.c
>> index ddd3d82f3426..9e39bfe75971 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.c
>> +++ b/drivers/accel/amdxdna/aie2_pci.c
>> @@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct 
>> amdxdna_hwctx *hwctx, void *arg)
>>       struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
>>       struct amdxdna_drm_get_array *array_args = arg;
>>       struct amdxdna_drm_hwctx_entry __user *buf;
>> +    struct app_health_report report;
>> +    struct amdxdna_dev_hdl *ndev;
>>       u32 size;
>> +    int ret;
>>         if (!array_args->num_element)
>>           return -EINVAL;
>> @@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct 
>> amdxdna_hwctx *hwctx, void *arg)
>>       tmp->latency = hwctx->qos.latency;
>>       tmp->frame_exec_time = hwctx->qos.frame_exec_time;
>>       tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
>> +    ndev = hwctx->client->xdna->dev_handle;
>> +    ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
>> +    if (!ret) {
>> +        /* Fill in app health report fields */
>> +        tmp->txn_op_idx = report.txn_op_id;
>> +        tmp->ctx_pc = report.ctx_pc;
>> +        tmp->fatal_error_type = report.fatal_info.fatal_type;
>> +        tmp->fatal_error_exception_type = 
>> report.fatal_info.exception_type;
>> +        tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
>> +        tmp->fatal_error_app_module = report.fatal_info.app_module;
>> +    }
>>         buf = u64_to_user_ptr(array_args->buffer);
>>       size = min(sizeof(*tmp), array_args->element_size);
>> diff --git a/drivers/accel/amdxdna/aie2_pci.h 
>> b/drivers/accel/amdxdna/aie2_pci.h
>> index 885ae7e6bfc7..efcf4be035f0 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.h
>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>> @@ -10,6 +10,7 @@
>>   #include <linux/limits.h>
>>   #include <linux/semaphore.h>
>>   +#include "aie2_msg_priv.h"
>>   #include "amdxdna_mailbox.h"
>>     #define AIE2_INTERVAL    20000    /* us */
>> @@ -261,6 +262,7 @@ enum aie2_fw_feature {
>>       AIE2_NPU_COMMAND,
>>       AIE2_PREEMPT,
>>       AIE2_TEMPORAL_ONLY,
>> +    AIE2_APP_HEALTH,
>>       AIE2_FEATURE_MAX
>>   };
>>   @@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
>>       u32 min_minor;
>>   };
>>   +#define AIE2_ALL_FEATURES    GENMASK_ULL(AIE2_FEATURE_MAX - 1, 
>> AIE2_NPU_COMMAND)
>>   #define AIE2_FEATURE_ON(ndev, feature)    test_bit(feature, 
>> &(ndev)->feature_mask)
>>     struct amdxdna_dev_priv {
>> @@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl 
>> *ndev, struct aie_version *ver
>>   int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct 
>> aie_metadata *metadata);
>>   int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
>>                   struct amdxdna_fw_ver *fw_ver);
>> +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
>> +              struct app_health_report *report);
>>   int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct 
>> amdxdna_hwctx *hwctx);
>>   int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct 
>> amdxdna_hwctx *hwctx);
>>   int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, 
>> u64 addr, u64 size);
>> diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c 
>> b/drivers/accel/amdxdna/amdxdna_ctx.c
>> index 666dfd7b2a80..4b921715176d 100644
>> --- a/drivers/accel/amdxdna/amdxdna_ctx.c
>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.c
>> @@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj 
>> *abo)
>>     int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
>>                 struct amdxdna_sched_job *job, u32 cmd_idx,
>> -              enum ert_cmd_state error_state)
>> +              enum ert_cmd_state error_state,
>> +              void *err_data, size_t size)
>>   {
>>       struct amdxdna_client *client = job->hwctx->client;
>>       struct amdxdna_cmd *cmd = abo->mem.kva;
>> @@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj 
>> *abo,
>>       }
>>         memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
>> +    if (err_data)
>> +        memcpy(cmd->data, err_data, min(size, abo->mem.size - 
>> sizeof(*cmd)));
>> +
>>       if (cc)
>>           amdxdna_gem_put_obj(abo);
>>   diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h 
>> b/drivers/accel/amdxdna/amdxdna_ctx.h
>> index fbdf9d000871..57db1527a93b 100644
>> --- a/drivers/accel/amdxdna/amdxdna_ctx.h
>> +++ b/drivers/accel/amdxdna/amdxdna_ctx.h
>> @@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
>>       u32 prop_args[];    /* properties and regular kernel arguments */
>>   };
>>   +#define AMDXDNA_CMD_CTX_HEALTH_V1    1
>> +#define AMDXDNA_CMD_CTX_HEALTH_AIE2    0
>> +struct amdxdna_ctx_health {
>> +    u32 version;
>> +    u32 npu_gen;
>> +};
>> +
>>   /* Exec buffer command header format */
>>   #define AMDXDNA_CMD_STATE        GENMASK(3, 0)
>>   #define AMDXDNA_CMD_EXTRA_CU_MASK    GENMASK(11, 10)
>> @@ -122,6 +129,11 @@ struct amdxdna_drv_cmd {
>>       u32            result;
>>   };
>>   +struct app_health_report;
>> +union amdxdna_job_priv {
>> +    struct app_health_report *aie2_health;
>> +};
>> +
>>   struct amdxdna_sched_job {
>>       struct drm_sched_job    base;
>>       struct kref        refcnt;
>> @@ -136,10 +148,13 @@ struct amdxdna_sched_job {
>>       u64            seq;
>>       struct amdxdna_drv_cmd    *drv_cmd;
>>       struct amdxdna_gem_obj    *cmd_bo;
>> +    union amdxdna_job_priv    priv;
>>       size_t            bo_cnt;
>>       struct drm_gem_object    *bos[] __counted_by(bo_cnt);
>>   };
>>   +#define aie2_job_health priv.aie2_health
>> +
>>   static inline u32
>>   amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
>>   {
>> @@ -169,7 +184,8 @@ void *amdxdna_cmd_get_payload(struct 
>> amdxdna_gem_obj *abo, u32 *size);
>>   u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
>>   int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
>>                 struct amdxdna_sched_job *job, u32 cmd_idx,
>> -              enum ert_cmd_state error_state);
>> +              enum ert_cmd_state error_state,
>> +              void *err_data, size_t size);
>>     void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
>>   void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
>> diff --git a/drivers/accel/amdxdna/npu4_regs.c 
>> b/drivers/accel/amdxdna/npu4_regs.c
>> index ce25eef5fc34..619bff042e52 100644
>> --- a/drivers/accel/amdxdna/npu4_regs.c
>> +++ b/drivers/accel/amdxdna/npu4_regs.c
>> @@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl 
>> npu4_fw_feature_table[] = {
>>       { .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor 
>> = 15 },
>>       { .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 
>> 12 },
>>       { .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, 
>> .min_minor = 12 },
>> -    { .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), 
>> .major = 7 },
>> +    { .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 
>> 18 },
>> +    { .features = AIE2_ALL_FEATURES, .major = 7 },
>>       { 0 }
>>   };
>