From: Nathan Lynch <nathan.lynch@amd.com>
SDXI implementations provide software with detailed information about
error conditions using a per-device ring buffer in system memory. When
an error condition is signaled via interrupt, the driver retrieves any
pending error log entries and reports them to the kernel log.
Co-developed-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Nathan Lynch <nathan.lynch@amd.com>
---
drivers/dma/sdxi/error.c | 340 +++++++++++++++++++++++++++++++++++++++++++++++
drivers/dma/sdxi/error.h | 16 +++
2 files changed, 356 insertions(+)
diff --git a/drivers/dma/sdxi/error.c b/drivers/dma/sdxi/error.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5e33f5989250352f6b081a3049b3b1f972c85a6
--- /dev/null
+++ b/drivers/dma/sdxi/error.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SDXI error reporting.
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irqreturn.h>
+#include <linux/packing.h>
+#include <linux/types.h>
+
+#include "error.h"
+#include "mmio.h"
+#include "sdxi.h"
+
+/*
+ * The error log ring buffer size is configurable, but for now we fix
+ * it to 64 entries (which is the spec minimum).
+ */
+#define ERROR_LOG_ENTRIES 64
+#define ERROR_LOG_SZ (ERROR_LOG_ENTRIES * sizeof(struct sdxi_errlog_hd_ent))
+
+/*
+ * The "unpacked" counterpart to ERRLOG_HD_ENT. Members are filled in
+ * from the packed entry by unpack_fields() using errlog_hd_ent_fields[];
+ * the bit positions noted below are the ones listed in that table.
+ */
+struct errlog_entry {
+ u64 dsc_index; /* bits 127:64; reported only when div is set */
+ u16 cxt_num; /* bits 63:48; reported only when cv is set */
+ u16 err_class; /* bits 367:352 */
+ u16 type; /* bits 26:16; entry is reported only if OP_TYPE_ERRLOG */
+ u8 step; /* bits 13:8; decoded by step_str() */
+ u8 buf; /* bits 38:36; reported only when bv is set */
+ u8 sub_step; /* bits 43:40; decoded by sub_step_str() */
+ u8 re; /* bits 46:44; decoded by reaction_str() */
+ bool vl; /* bit 0; entries with vl == 0 are ignored */
+ bool cv; /* bit 32 */
+ bool div; /* bit 33 */
+ bool bv; /* bit 34 */
+};
+
+/*
+ * Describe one member of struct errlog_entry by the bit range
+ * hi_..lo_ that it is unpacked from.
+ */
+#define ERRLOG_ENTRY_FIELD(hi_, lo_, name_) \
+ PACKED_FIELD(hi_, lo_, struct errlog_entry, name_)
+/* A flag is a field that is exactly one bit wide. */
+#define ERRLOG_ENTRY_FLAG(nr_, name_) \
+ ERRLOG_ENTRY_FIELD(nr_, nr_, name_)
+
+/*
+ * Refer to "Error Log Header Entry (ERRLOG_HD_ENT)".
+ * Entries are listed in ascending bit order.
+ */
+static const struct packed_field_u16 errlog_hd_ent_fields[] = {
+ ERRLOG_ENTRY_FLAG(0, vl),
+ ERRLOG_ENTRY_FIELD(13, 8, step),
+ ERRLOG_ENTRY_FIELD(26, 16, type),
+ ERRLOG_ENTRY_FLAG(32, cv),
+ ERRLOG_ENTRY_FLAG(33, div),
+ ERRLOG_ENTRY_FLAG(34, bv),
+ ERRLOG_ENTRY_FIELD(38, 36, buf),
+ ERRLOG_ENTRY_FIELD(43, 40, sub_step),
+ ERRLOG_ENTRY_FIELD(46, 44, re),
+ ERRLOG_ENTRY_FIELD(63, 48, cxt_num),
+ ERRLOG_ENTRY_FIELD(127, 64, dsc_index),
+ ERRLOG_ENTRY_FIELD(367, 352, err_class),
+};
+
+/* Packing quirk flags passed to unpack_fields() when decoding log entries. */
+enum {
+ SDXI_PACKING_QUIRKS = QUIRK_LITTLE_ENDIAN | QUIRK_LSW32_IS_FIRST,
+};
+
+/*
+ * Refer to "(Flagged) Processing Step" and
+ * "Error Log Header Entry (ERRLOG_HD_ENT)", subfield "step"
+ */
+enum errv_step {
+ ERRV_INT = 1,
+ ERRV_CXT_L2 = 2,
+ ERRV_CXT_L1 = 3,
+ ERRV_CXT_CTL = 4,
+ ERRV_CXT_STS = 5,
+ ERRV_WRT_IDX = 6,
+ ERRV_DSC_GEN = 7,
+ ERRV_DSC_CSB = 8,
+ ERRV_ATOMIC = 9,
+ ERRV_DSC_BUF = 10,
+ ERRV_DSC_AKEY = 11,
+ ERRV_FN_RKEY = 12,
+};
+
+/*
+ * Descriptions indexed by enum errv_step. Slot 0 has no initializer;
+ * step_str() only indexes ERRV_INT..ERRV_FN_RKEY.
+ */
+static const char *const processing_steps[] = {
+ [ERRV_INT] = "Internal Error",
+ [ERRV_CXT_L2] = "Context Level 2 Table Entry - Translate, Read, Validate",
+ [ERRV_CXT_L1] = "Context Level 1 Table Entry - Translate, Read, Validate",
+ [ERRV_CXT_CTL] = "Context Control - Translate, Read, Validate",
+ [ERRV_CXT_STS] = "Context Status - Translate, Access, Validate",
+ [ERRV_WRT_IDX] = "Write_Index - Translate, Read, Validate",
+ [ERRV_DSC_GEN] = "Descriptor Entry - Translate, Access, Validate",
+ [ERRV_DSC_CSB] = "Descriptor CST_BLK - Translate, Access, Validate",
+ [ERRV_ATOMIC] = "Atomic Return Data - Translate, Access",
+ [ERRV_DSC_BUF] = "Descriptor: Data Buffer - Translate, Access",
+ [ERRV_DSC_AKEY] = "Descriptor AKey Lookup - Translate, Access, Validate",
+ [ERRV_FN_RKEY] = "Function RKey Lookup - Translate, Read, Validate",
+};
+
+/*
+ * Translate the "step" subfield of an error log entry into a
+ * description. Values outside ERRV_INT..ERRV_FN_RKEY are reported as
+ * "reserved".
+ */
+static const char *step_str(enum errv_step step)
+{
+	if (step < ERRV_INT || step > ERRV_FN_RKEY)
+		return "reserved";
+
+	return processing_steps[step];
+}
+
+/* Refer to "Error Log Header Entry (ERRLOG_HD_ENT)", subfield "sub_step" */
+enum errv_sub_step {
+ SUB_STEP_OTHER = 0,
+ SUB_STEP_ATF = 1,
+ SUB_STEP_DAF = 2,
+ SUB_STEP_DVF = 3,
+};
+
+/* Descriptions indexed by enum errv_sub_step; see sub_step_str(). */
+static const char * const processing_sub_steps[] = {
+ [SUB_STEP_OTHER] = "Other/unknown",
+ [SUB_STEP_ATF] = "Address Translation Failure",
+ [SUB_STEP_DAF] = "Data Access Failure",
+ [SUB_STEP_DVF] = "Data Validation Failure",
+};
+
+/*
+ * Translate the "sub_step" subfield of an error log entry into a
+ * description. Values outside SUB_STEP_OTHER..SUB_STEP_DVF are
+ * reported as "reserved".
+ */
+static const char *sub_step_str(enum errv_sub_step sub_step)
+{
+	const bool known = sub_step >= SUB_STEP_OTHER &&
+			   sub_step <= SUB_STEP_DVF;
+
+	return known ? processing_sub_steps[sub_step] : "reserved";
+}
+
+/* Refer to "Error Log Header Entry (ERRLOG_HD_ENT)", subfield "re" */
+enum fn_reaction {
+ FN_REACT_INFORM = 0,
+ FN_REACT_CXT_STOP = 1,
+ FN_REACT_FN_STOP = 2,
+};
+
+/* Descriptions indexed by enum fn_reaction; see reaction_str(). */
+static const char * const fn_reactions[] = {
+ [FN_REACT_INFORM] = "Informative, nothing stopped",
+ [FN_REACT_CXT_STOP] = "Context stopped",
+ [FN_REACT_FN_STOP] = "Function stopped",
+};
+
+/*
+ * Translate the "re" subfield of an error log entry into a
+ * description. Values outside FN_REACT_INFORM..FN_REACT_FN_STOP are
+ * reported as "reserved".
+ */
+static const char *reaction_str(enum fn_reaction reaction)
+{
+	if (reaction >= FN_REACT_INFORM && reaction <= FN_REACT_FN_STOP)
+		return fn_reactions[reaction];
+
+	return "reserved";
+}
+
+/*
+ * Decode the error log entry selected by @err_rd and report it.
+ *
+ * @err_rd is reduced modulo ERROR_LOG_ENTRIES to find the ring buffer
+ * slot. An entry is skipped, with a rate-limited message, when its vl
+ * flag is clear or its type is not OP_TYPE_ERRLOG.
+ */
+static void sdxi_print_err(struct sdxi_dev *sdxi, u64 err_rd)
+{
+ struct errlog_entry ent;
+ size_t index;
+
+ index = err_rd % ERROR_LOG_ENTRIES;
+
+ /* Extract the fields of interest from the packed entry into ent. */
+ unpack_fields(&sdxi->err_log[index], sizeof(sdxi->err_log[0]),
+ &ent, errlog_hd_ent_fields, SDXI_PACKING_QUIRKS);
+
+ if (!ent.vl) {
+ dev_err_ratelimited(sdxi_to_dev(sdxi),
+ "Ignoring error log entry with vl=0\n");
+ return;
+ }
+
+ if (ent.type != OP_TYPE_ERRLOG) {
+ dev_err_ratelimited(sdxi_to_dev(sdxi),
+ "Ignoring error log entry with type=%#x\n",
+ ent.type);
+ return;
+ }
+
+ sdxi_err(sdxi, "error log entry[%zu], MMIO_ERR_RD=%#llx:\n",
+ index, err_rd);
+ sdxi_err(sdxi, " re: %#x (%s)\n", ent.re, reaction_str(ent.re));
+ sdxi_err(sdxi, " step: %#x (%s)\n", ent.step, step_str(ent.step));
+ sdxi_err(sdxi, " sub_step: %#x (%s)\n",
+ ent.sub_step, sub_step_str(ent.sub_step));
+ sdxi_err(sdxi, " cv: %u div: %u bv: %u\n", ent.cv, ent.div, ent.bv);
+ /* buf, cxt_num and dsc_index are printed only when their flag is set. */
+ if (ent.bv)
+ sdxi_err(sdxi, " buf: %u\n", ent.buf);
+ if (ent.cv)
+ sdxi_err(sdxi, " cxt_num: %#x\n", ent.cxt_num);
+ if (ent.div)
+ sdxi_err(sdxi, " dsc_index: %#llx\n", ent.dsc_index);
+ sdxi_err(sdxi, " err_class: %#x\n", ent.err_class);
+}
+
+/*
+ * Threaded handler for the error interrupt; @data is the struct
+ * sdxi_dev registered with the IRQ. Every path returns IRQ_HANDLED.
+ *
+ * Refer to "Error Log Processing by Software"; the numbered comments
+ * below are the steps of that sequence.
+ */
+static irqreturn_t sdxi_irq_thread(int irq, void *data)
+{
+ struct sdxi_dev *sdxi = data;
+ u64 write_index;
+ u64 read_index;
+ u64 err_sts;
+
+ /* 1. Check MMIO_ERR_STS and perform any required remediation. */
+ err_sts = sdxi_read64(sdxi, SDXI_MMIO_ERR_STS);
+ /* Status bit clear: there is no log processing to do. */
+ if (!(err_sts & SDXI_MMIO_ERR_STS_STS_BIT))
+ return IRQ_HANDLED;
+
+ if (err_sts & SDXI_MMIO_ERR_STS_ERR_BIT) {
+ /*
+ * Assume this isn't recoverable; e.g. the error log
+ * isn't configured correctly. Don't clear
+ * SDXI_MMIO_ERR_STS before returning.
+ */
+ sdxi_err(sdxi, "attempted but failed to log errors\n");
+ sdxi_err(sdxi, "error log not functional\n");
+ return IRQ_HANDLED;
+ }
+
+ /* Overflow is reported, then the remaining entries are still processed. */
+ if (err_sts & SDXI_MMIO_ERR_STS_OVF_BIT)
+ sdxi_err(sdxi, "error log overflow, some entries lost\n");
+
+ /* 2. If MMIO_ERR_STS.sts is 1, then compute read_index. */
+ read_index = sdxi_read64(sdxi, SDXI_MMIO_ERR_RD);
+
+ /* 3. Clear MMIO_ERR_STS. The flags in this register are RW1C. */
+ sdxi_write64(sdxi, SDXI_MMIO_ERR_STS,
+ SDXI_MMIO_ERR_STS_STS_BIT |
+ SDXI_MMIO_ERR_STS_OVF_BIT |
+ SDXI_MMIO_ERR_STS_ERR_BIT);
+
+ /* 4. Compute write_index. */
+ write_index = sdxi_read64(sdxi, SDXI_MMIO_ERR_WRT);
+
+ /* 5. If the indexes are equal then exit. */
+ if (read_index == write_index)
+ return IRQ_HANDLED;
+
+ /* 6. While read_index < write_index... */
+ while (read_index < write_index) {
+
+ /*
+ * 7. and 8. Compute the real ring buffer index from
+ * read_index and process the entry.
+ */
+ sdxi_print_err(sdxi, read_index);
+
+ /* 9. Advance read_index. */
+ ++read_index;
+
+ /* 10. Return to step 6. */
+ }
+
+ /* 11. Write read_index to MMIO_ERR_RD. */
+ sdxi_write64(sdxi, SDXI_MMIO_ERR_RD, read_index);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Set up error logging for @sdxi: allocate the error log ring buffer,
+ * enable the error interrupts, install the threaded handler and
+ * finally enable the log.
+ *
+ * Refer to "Error Log Initialization"; the numbered comments below
+ * correspond to the numbered steps of that sequence.
+ *
+ * Returns 0 on success, -ENOMEM if the ring buffer cannot be
+ * allocated, or the error returned by request_threaded_irq(), in
+ * which case the ring buffer is freed again.
+ */
+int sdxi_error_init(struct sdxi_dev *sdxi)
+{
+	u64 reg;
+	int err;
+
+	/* 1. Clear MMIO_ERR_CFG. Error interrupts are inhibited until step 6. */
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, 0);
+
+	/* 2. Clear MMIO_ERR_STS. The flags in this register are RW1C. */
+	reg = FIELD_PREP(SDXI_MMIO_ERR_STS_STS_BIT, 1) |
+	      FIELD_PREP(SDXI_MMIO_ERR_STS_OVF_BIT, 1) |
+	      FIELD_PREP(SDXI_MMIO_ERR_STS_ERR_BIT, 1);
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_STS, reg);
+
+	/* 3. Allocate memory for the error log ring buffer, initialize to zero. */
+	sdxi->err_log = dma_alloc_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
+					   &sdxi->err_log_dma, GFP_KERNEL);
+	if (!sdxi->err_log)
+		return -ENOMEM;
+
+	/*
+	 * 4. Set MMIO_ERR_CTL.intr_en to 1 if interrupts on
+	 * context-level errors are desired.
+	 */
+	reg = sdxi_read64(sdxi, SDXI_MMIO_ERR_CTL);
+	FIELD_MODIFY(SDXI_MMIO_ERR_CTL_EN, &reg, 1);
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_CTL, reg);
+
+	/*
+	 * The spec is not explicit about when to do this, but this
+	 * seems like the right time: enable interrupt on
+	 * function-level transition to error state.
+	 */
+	reg = sdxi_read64(sdxi, SDXI_MMIO_CTL0);
+	FIELD_MODIFY(SDXI_MMIO_CTL0_FN_ERR_INTR_EN, &reg, 1);
+	sdxi_write64(sdxi, SDXI_MMIO_CTL0, reg);
+
+	/* 5. Clear MMIO_ERR_WRT and MMIO_ERR_RD. */
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_WRT, 0);
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_RD, 0);
+
+	/*
+	 * Error interrupts can be generated once MMIO_ERR_CFG.en is
+	 * set in step 6, so set up the handler now. There is no
+	 * primary handler, so IRQF_ONESHOT is passed: without it the
+	 * IRQ core rejects the request with -EINVAL unless the
+	 * interrupt chip is marked one-shot safe.
+	 */
+	err = request_threaded_irq(sdxi->error_irq, NULL, sdxi_irq_thread,
+				   IRQF_ONESHOT, "SDXI error", sdxi);
+	if (err)
+		goto free_errlog;
+
+	/* 6. Program MMIO_ERR_CFG. */
+	reg = FIELD_PREP(SDXI_MMIO_ERR_CFG_PTR, sdxi->err_log_dma >> 12) |
+	      FIELD_PREP(SDXI_MMIO_ERR_CFG_SZ, ERROR_LOG_ENTRIES >> 6) |
+	      FIELD_PREP(SDXI_MMIO_ERR_CFG_EN, 1);
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, reg);
+
+	return 0;
+
+free_errlog:
+	dma_free_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
+			  sdxi->err_log, sdxi->err_log_dma);
+	return err;
+}
+
+/*
+ * Tear down error logging for @sdxi: clear MMIO_ERR_CFG, release the
+ * error interrupt, then free the error log ring buffer.
+ */
+void sdxi_error_exit(struct sdxi_dev *sdxi)
+{
+ sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, 0);
+ free_irq(sdxi->error_irq, sdxi);
+ dma_free_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
+ sdxi->err_log, sdxi->err_log_dma);
+}
diff --git a/drivers/dma/sdxi/error.h b/drivers/dma/sdxi/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..50019d9811184464227ae13baa509101a2a3aacc
--- /dev/null
+++ b/drivers/dma/sdxi/error.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SDXI error handling entry points.
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#ifndef DMA_SDXI_ERROR_H
+#define DMA_SDXI_ERROR_H
+
+struct sdxi_dev;
+
+/* Set up the error log and its interrupt handler; returns 0 or a negative errno. */
+int sdxi_error_init(struct sdxi_dev *sdxi);
+/* Clear MMIO_ERR_CFG, release the error interrupt and free the error log buffer. */
+void sdxi_error_exit(struct sdxi_dev *sdxi);
+
+#endif /* DMA_SDXI_ERROR_H */
--
2.39.5
On Fri, 05 Sep 2025 13:48:29 -0500
Nathan Lynch via B4 Relay <devnull+nathan.lynch.amd.com@kernel.org> wrote:
> From: Nathan Lynch <nathan.lynch@amd.com>
>
> SDXI implementations provide software with detailed information about
> error conditions using a per-device ring buffer in system memory. When
> an error condition is signaled via interrupt, the driver retrieves any
> pending error log entries and reports them to the kernel log.
>
> Co-developed-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Wei Huang <wei.huang2@amd.com>
> Signed-off-by: Nathan Lynch <nathan.lynch@amd.com>
Hi,
A few more comments inline. Kind of similar stuff around
having both register definitions for unpacking and the structure
definitions in patch 2.
Thanks,
Jonathan
> ---
> drivers/dma/sdxi/error.c | 340 +++++++++++++++++++++++++++++++++++++++++++++++
> drivers/dma/sdxi/error.h | 16 +++
> 2 files changed, 356 insertions(+)
>
> diff --git a/drivers/dma/sdxi/error.c b/drivers/dma/sdxi/error.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c5e33f5989250352f6b081a3049b3b1f972c85a6
> --- /dev/null
> +++ b/drivers/dma/sdxi/error.c
> +/* The "unpacked" counterpart to ERRLOG_HD_ENT. */
> +struct errlog_entry {
> + u64 dsc_index;
> + u16 cxt_num;
> + u16 err_class;
> + u16 type;
> + u8 step;
> + u8 buf;
> + u8 sub_step;
> + u8 re;
> + bool vl;
> + bool cv;
> + bool div;
> + bool bv;
> +};
> +
> +#define ERRLOG_ENTRY_FIELD(hi_, lo_, name_) \
> + PACKED_FIELD(hi_, lo_, struct errlog_entry, name_)
> +#define ERRLOG_ENTRY_FLAG(nr_, name_) \
> + ERRLOG_ENTRY_FIELD(nr_, nr_, name_)
> +
> +/* Refer to "Error Log Header Entry (ERRLOG_HD_ENT)" */
> +static const struct packed_field_u16 errlog_hd_ent_fields[] = {
> + ERRLOG_ENTRY_FLAG(0, vl),
> + ERRLOG_ENTRY_FIELD(13, 8, step),
> + ERRLOG_ENTRY_FIELD(26, 16, type),
> + ERRLOG_ENTRY_FLAG(32, cv),
> + ERRLOG_ENTRY_FLAG(33, div),
> + ERRLOG_ENTRY_FLAG(34, bv),
> + ERRLOG_ENTRY_FIELD(38, 36, buf),
> + ERRLOG_ENTRY_FIELD(43, 40, sub_step),
> + ERRLOG_ENTRY_FIELD(46, 44, re),
> + ERRLOG_ENTRY_FIELD(63, 48, cxt_num),
> + ERRLOG_ENTRY_FIELD(127, 64, dsc_index),
> + ERRLOG_ENTRY_FIELD(367, 352, err_class),
The association between the fields here and struct sdxi_err_log_hd_ent
to me should be via some defines in patch 2 for the various fields
embedded in misc0 etc.
> +};
> +static void sdxi_print_err(struct sdxi_dev *sdxi, u64 err_rd)
> +{
> + struct errlog_entry ent;
> + size_t index;
> +
> + index = err_rd % ERROR_LOG_ENTRIES;
> +
> + unpack_fields(&sdxi->err_log[index], sizeof(sdxi->err_log[0]),
> + &ent, errlog_hd_ent_fields, SDXI_PACKING_QUIRKS);
> +
> + if (!ent.vl) {
> + dev_err_ratelimited(sdxi_to_dev(sdxi),
> + "Ignoring error log entry with vl=0\n");
> + return;
> + }
> +
> + if (ent.type != OP_TYPE_ERRLOG) {
> + dev_err_ratelimited(sdxi_to_dev(sdxi),
> + "Ignoring error log entry with type=%#x\n",
> + ent.type);
> + return;
> + }
> +
> + sdxi_err(sdxi, "error log entry[%zu], MMIO_ERR_RD=%#llx:\n",
> + index, err_rd);
> + sdxi_err(sdxi, " re: %#x (%s)\n", ent.re, reaction_str(ent.re));
> + sdxi_err(sdxi, " step: %#x (%s)\n", ent.step, step_str(ent.step));
> + sdxi_err(sdxi, " sub_step: %#x (%s)\n",
> + ent.sub_step, sub_step_str(ent.sub_step));
> + sdxi_err(sdxi, " cv: %u div: %u bv: %u\n", ent.cv, ent.div, ent.bv);
> + if (ent.bv)
> + sdxi_err(sdxi, " buf: %u\n", ent.buf);
> + if (ent.cv)
> + sdxi_err(sdxi, " cxt_num: %#x\n", ent.cxt_num);
> + if (ent.div)
> + sdxi_err(sdxi, " dsc_index: %#llx\n", ent.dsc_index);
> + sdxi_err(sdxi, " err_class: %#x\n", ent.err_class);
Consider using tracepoints for error logging rather than large splats in the
log. Maybe you add those in later patches!
I'd then just fill the tracepoint in directly rather than have an unpacking
step.
> +}
> +/* Refer to "Error Log Initialization" */
> +int sdxi_error_init(struct sdxi_dev *sdxi)
> +{
> + u64 reg;
> + int err;
> +
> + /* 1. Clear MMIO_ERR_CFG. Error interrupts are inhibited until step 6. */
> + sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, 0);
> +
> + /* 2. Clear MMIO_ERR_STS. The flags in this register are RW1C. */
> + reg = FIELD_PREP(SDXI_MMIO_ERR_STS_STS_BIT, 1) |
> + FIELD_PREP(SDXI_MMIO_ERR_STS_OVF_BIT, 1) |
> + FIELD_PREP(SDXI_MMIO_ERR_STS_ERR_BIT, 1);
> + sdxi_write64(sdxi, SDXI_MMIO_ERR_STS, reg);
> +
> + /* 3. Allocate memory for the error log ring buffer, initialize to zero. */
> + sdxi->err_log = dma_alloc_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
> + &sdxi->err_log_dma, GFP_KERNEL);
> + if (!sdxi->err_log)
> + return -ENOMEM;
> +
> + /*
> + * 4. Set MMIO_ERR_CTL.intr_en to 1 if interrupts on
> + * context-level errors are desired.
> + */
> + reg = sdxi_read64(sdxi, SDXI_MMIO_ERR_CTL);
> + FIELD_MODIFY(SDXI_MMIO_ERR_CTL_EN, &reg, 1);
> + sdxi_write64(sdxi, SDXI_MMIO_ERR_CTL, reg);
> +
> + /*
> + * The spec is not explicit about when to do this, but this
> + * seems like the right time: enable interrupt on
> + * function-level transition to error state.
> + */
> + reg = sdxi_read64(sdxi, SDXI_MMIO_CTL0);
> + FIELD_MODIFY(SDXI_MMIO_CTL0_FN_ERR_INTR_EN, &reg, 1);
> + sdxi_write64(sdxi, SDXI_MMIO_CTL0, reg);
> +
> + /* 5. Clear MMIO_ERR_WRT and MMIO_ERR_RD. */
> + sdxi_write64(sdxi, SDXI_MMIO_ERR_WRT, 0);
> + sdxi_write64(sdxi, SDXI_MMIO_ERR_RD, 0);
> +
> + /*
> + * Error interrupts can be generated once MMIO_ERR_CFG.en is
> + * set in step 6, so set up the handler now.
> + */
> + err = request_threaded_irq(sdxi->error_irq, NULL, sdxi_irq_thread,
> + IRQF_TRIGGER_NONE, "SDXI error", sdxi);
> + if (err)
> + goto free_errlog;
> +
> + /* 6. Program MMIO_ERR_CFG. */
I'm guessing these are numbers steps in some bit of the spec?
If not some of these comments like this one provide no value. We can
see what is being written from the code! Perhaps add a very specific
spec reference if you want to show why the numbering is here.
> + reg = FIELD_PREP(SDXI_MMIO_ERR_CFG_PTR, sdxi->err_log_dma >> 12) |
> + FIELD_PREP(SDXI_MMIO_ERR_CFG_SZ, ERROR_LOG_ENTRIES >> 6) |
> + FIELD_PREP(SDXI_MMIO_ERR_CFG_EN, 1);
> + sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, reg);
> +
> + return 0;
> +
> +free_errlog:
> + dma_free_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
> + sdxi->err_log, sdxi->err_log_dma);
> + return err;
> +}
> +
> +void sdxi_error_exit(struct sdxi_dev *sdxi)
> +{
> + sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, 0);
> + free_irq(sdxi->error_irq, sdxi);
> + dma_free_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
> + sdxi->err_log, sdxi->err_log_dma);
> +}
>
Jonathan Cameron <jonathan.cameron@huawei.com> writes:
> On Fri, 05 Sep 2025 13:48:29 -0500
> Nathan Lynch via B4 Relay <devnull+nathan.lynch.amd.com@kernel.org> wrote:
>
>> From: Nathan Lynch <nathan.lynch@amd.com>
>>
>> SDXI implementations provide software with detailed information about
>> error conditions using a per-device ring buffer in system memory. When
>> an error condition is signaled via interrupt, the driver retrieves any
>> pending error log entries and reports them to the kernel log.
>>
>> Co-developed-by: Wei Huang <wei.huang2@amd.com>
>> Signed-off-by: Wei Huang <wei.huang2@amd.com>
>> Signed-off-by: Nathan Lynch <nathan.lynch@amd.com>
> Hi,
> A few more comments inline. Kind of similar stuff around
> having both register definitions for unpacking and the structure
> definitions in patch 2.
>
> Thanks,
>
> Jonathan
>> ---
>> drivers/dma/sdxi/error.c | 340 +++++++++++++++++++++++++++++++++++++++++++++++
>> drivers/dma/sdxi/error.h | 16 +++
>> 2 files changed, 356 insertions(+)
>>
>> diff --git a/drivers/dma/sdxi/error.c b/drivers/dma/sdxi/error.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..c5e33f5989250352f6b081a3049b3b1f972c85a6
>> --- /dev/null
>> +++ b/drivers/dma/sdxi/error.c
>
>> +/* The "unpacked" counterpart to ERRLOG_HD_ENT. */
>> +struct errlog_entry {
>> + u64 dsc_index;
>> + u16 cxt_num;
>> + u16 err_class;
>> + u16 type;
>> + u8 step;
>> + u8 buf;
>> + u8 sub_step;
>> + u8 re;
>> + bool vl;
>> + bool cv;
>> + bool div;
>> + bool bv;
>> +};
>> +
>> +#define ERRLOG_ENTRY_FIELD(hi_, lo_, name_) \
>> + PACKED_FIELD(hi_, lo_, struct errlog_entry, name_)
>> +#define ERRLOG_ENTRY_FLAG(nr_, name_) \
>> + ERRLOG_ENTRY_FIELD(nr_, nr_, name_)
>> +
>> +/* Refer to "Error Log Header Entry (ERRLOG_HD_ENT)" */
>> +static const struct packed_field_u16 errlog_hd_ent_fields[] = {
>> + ERRLOG_ENTRY_FLAG(0, vl),
>> + ERRLOG_ENTRY_FIELD(13, 8, step),
>> + ERRLOG_ENTRY_FIELD(26, 16, type),
>> + ERRLOG_ENTRY_FLAG(32, cv),
>> + ERRLOG_ENTRY_FLAG(33, div),
>> + ERRLOG_ENTRY_FLAG(34, bv),
>> + ERRLOG_ENTRY_FIELD(38, 36, buf),
>> + ERRLOG_ENTRY_FIELD(43, 40, sub_step),
>> + ERRLOG_ENTRY_FIELD(46, 44, re),
>> + ERRLOG_ENTRY_FIELD(63, 48, cxt_num),
>> + ERRLOG_ENTRY_FIELD(127, 64, dsc_index),
>> + ERRLOG_ENTRY_FIELD(367, 352, err_class),
>
> The association between the fields here and struct sdxi_err_log_hd_ent
> to me should be via some defines in patch 2 for the various fields
> embedded in misc0 etc.
>
>> +};
>
>> +static void sdxi_print_err(struct sdxi_dev *sdxi, u64 err_rd)
>> +{
>> + struct errlog_entry ent;
>> + size_t index;
>> +
>> + index = err_rd % ERROR_LOG_ENTRIES;
>> +
>> + unpack_fields(&sdxi->err_log[index], sizeof(sdxi->err_log[0]),
>> + &ent, errlog_hd_ent_fields, SDXI_PACKING_QUIRKS);
>> +
>> + if (!ent.vl) {
>> + dev_err_ratelimited(sdxi_to_dev(sdxi),
>> + "Ignoring error log entry with vl=0\n");
>> + return;
>> + }
>> +
>> + if (ent.type != OP_TYPE_ERRLOG) {
>> + dev_err_ratelimited(sdxi_to_dev(sdxi),
>> + "Ignoring error log entry with type=%#x\n",
>> + ent.type);
>> + return;
>> + }
>> +
>> + sdxi_err(sdxi, "error log entry[%zu], MMIO_ERR_RD=%#llx:\n",
>> + index, err_rd);
>> + sdxi_err(sdxi, " re: %#x (%s)\n", ent.re, reaction_str(ent.re));
>> + sdxi_err(sdxi, " step: %#x (%s)\n", ent.step, step_str(ent.step));
>> + sdxi_err(sdxi, " sub_step: %#x (%s)\n",
>> + ent.sub_step, sub_step_str(ent.sub_step));
>> + sdxi_err(sdxi, " cv: %u div: %u bv: %u\n", ent.cv, ent.div, ent.bv);
>> + if (ent.bv)
>> + sdxi_err(sdxi, " buf: %u\n", ent.buf);
>> + if (ent.cv)
>> + sdxi_err(sdxi, " cxt_num: %#x\n", ent.cxt_num);
>> + if (ent.div)
>> + sdxi_err(sdxi, " dsc_index: %#llx\n", ent.dsc_index);
>> + sdxi_err(sdxi, " err_class: %#x\n", ent.err_class);
> Consider using tracepoints for error logging rather than large splats
> in the log.
Agreed, context-level errors (which will be user-triggerable once there
is an ABI exposed to user space) should not be dumped to the kernel log
by default.
Some function-level errors may be appropriate to print.
> I'd then just fill the tracepoint in directly rather than have an
> unpacking step.
Yes, I can do that.
>
>> +}
>
>> +/* Refer to "Error Log Initialization" */
>> +int sdxi_error_init(struct sdxi_dev *sdxi)
>> +{
>> + u64 reg;
>> + int err;
>> +
>> + /* 1. Clear MMIO_ERR_CFG. Error interrupts are inhibited until step 6. */
>> + sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, 0);
>> +
>> + /* 2. Clear MMIO_ERR_STS. The flags in this register are RW1C. */
>> + reg = FIELD_PREP(SDXI_MMIO_ERR_STS_STS_BIT, 1) |
>> + FIELD_PREP(SDXI_MMIO_ERR_STS_OVF_BIT, 1) |
>> + FIELD_PREP(SDXI_MMIO_ERR_STS_ERR_BIT, 1);
>> + sdxi_write64(sdxi, SDXI_MMIO_ERR_STS, reg);
>> +
>> + /* 3. Allocate memory for the error log ring buffer, initialize to zero. */
>> + sdxi->err_log = dma_alloc_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
>> + &sdxi->err_log_dma, GFP_KERNEL);
>> + if (!sdxi->err_log)
>> + return -ENOMEM;
>> +
>> + /*
>> + * 4. Set MMIO_ERR_CTL.intr_en to 1 if interrupts on
>> + * context-level errors are desired.
>> + */
>> + reg = sdxi_read64(sdxi, SDXI_MMIO_ERR_CTL);
>> + FIELD_MODIFY(SDXI_MMIO_ERR_CTL_EN, &reg, 1);
>> + sdxi_write64(sdxi, SDXI_MMIO_ERR_CTL, reg);
>> +
>> + /*
>> + * The spec is not explicit about when to do this, but this
>> + * seems like the right time: enable interrupt on
>> + * function-level transition to error state.
>> + */
>> + reg = sdxi_read64(sdxi, SDXI_MMIO_CTL0);
>> + FIELD_MODIFY(SDXI_MMIO_CTL0_FN_ERR_INTR_EN, &reg, 1);
>> + sdxi_write64(sdxi, SDXI_MMIO_CTL0, reg);
>> +
>> + /* 5. Clear MMIO_ERR_WRT and MMIO_ERR_RD. */
>> + sdxi_write64(sdxi, SDXI_MMIO_ERR_WRT, 0);
>> + sdxi_write64(sdxi, SDXI_MMIO_ERR_RD, 0);
>> +
>> + /*
>> + * Error interrupts can be generated once MMIO_ERR_CFG.en is
>> + * set in step 6, so set up the handler now.
>> + */
>> + err = request_threaded_irq(sdxi->error_irq, NULL, sdxi_irq_thread,
>> + IRQF_TRIGGER_NONE, "SDXI error", sdxi);
>> + if (err)
>> + goto free_errlog;
>> +
>> + /* 6. Program MMIO_ERR_CFG. */
>
> I'm guessing these are numbers steps in some bit of the spec?
> If not some of these comments like this one provide no value. We can
> see what is being written from the code! Perhaps add a very specific
> spec reference if you want to show why the numbering is here.
Perhaps it's understated, but at the beginning of this function:
/* Refer to "Error Log Initialization" */
int sdxi_error_init(struct sdxi_dev *sdxi)
The numbered steps in the function correspond to the numbered steps in
that part of the spec.
I could make the comment something like:
/*
* The numbered steps below correspond to the sequence outlined in 3.4.2
* "Error Log Initialization".
*/
though I'm unsure how stable the section numbering in the SDXI spec will
be over time.
> >> +
> >> + /* 6. Program MMIO_ERR_CFG. */
> >
> > I'm guessing these are numbers steps in some bit of the spec?
> > If not some of these comments like this one provide no value. We can
> > see what is being written from the code! Perhaps add a very specific
> > spec reference if you want to show why the numbering is here.
>
> Perhaps it's understated, but at the beginning of this function:
>
> /* Refer to "Error Log Initialization" */
> int sdxi_error_init(struct sdxi_dev *sdxi)
>
> The numbered steps in the function correspond to the numbered steps in
> that part of the spec.
>
> I could make the comment something like:
>
> /*
>  * The numbered steps below correspond to the sequence outlined in 3.4.2
>  * "Error Log Initialization".
>  */
>
> though I'm unsure how stable the section numbering in the SDXI spec will
> be over time.
Always reference sections by name (which you do!) and version of the spec
alongside the section number. They are rarely stable for long.
Jonathan
© 2016 - 2026 Red Hat, Inc.