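Register unknown_nmi_handler() as the SSE handler for the UNKNOWN_NMI
event. When the system becomes unresponsive, the event can be triggered
manually; if unknown_nmi_panic is set (through the "unknown_nmi_panic"
boot parameter or the unknown_nmi_panic sysctl under "kernel"), the
handler invokes nmi_panic() so that a vmcore can be collected for root
cause analysis. Otherwise the handler only logs the event and returns.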
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
arch/riscv/include/asm/sbi.h | 1 +
drivers/firmware/riscv/riscv_sse_nmi.c | 68 ++++++++++++++++++++++++++
2 files changed, 69 insertions(+)
diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 874cc1d7603a5..52d3fdf2d4cc1 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -486,6 +486,7 @@ enum sbi_sse_attr_id {
#define SBI_SSE_EVENT_LOCAL_LOW_PRIO_RAS 0x00100000
#define SBI_SSE_EVENT_GLOBAL_LOW_PRIO_RAS 0x00108000
#define SBI_SSE_EVENT_LOCAL_SOFTWARE_INJECTED 0xffff0000
+#define SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI 0xffff0001
#define SBI_SSE_EVENT_GLOBAL_SOFTWARE_INJECTED 0xffff8000
#define SBI_SSE_EVENT_PLATFORM BIT(14)
diff --git a/drivers/firmware/riscv/riscv_sse_nmi.c b/drivers/firmware/riscv/riscv_sse_nmi.c
index 85aa65f31943b..d98015d1cb893 100644
--- a/drivers/firmware/riscv/riscv_sse_nmi.c
+++ b/drivers/firmware/riscv/riscv_sse_nmi.c
@@ -7,6 +7,7 @@
#include <linux/nmi.h>
#include <linux/riscv_sbi_sse.h>
#include <linux/riscv_sse_nmi.h>
+#include <linux/sysctl.h>
#include <asm/irq_regs.h>
#include <asm/sbi.h>
@@ -16,7 +17,10 @@
do { if (type & (mask)) func(__VA_ARGS__); } while (0)
static bool nmi_available;
+static int unknown_nmi_panic;
static struct sse_event *local_nmi_evt;
+static struct sse_event *unknown_nmi_evt;
+static struct ctl_table_header *unknown_nmi_sysctl_header;
static DEFINE_PER_CPU(atomic_t, local_nmi) = ATOMIC_INIT(LOCAL_NMI_NONE);
bool nmi_support(void)
@@ -52,6 +56,35 @@ void send_nmi_mask(cpumask_t *mask, enum local_nmi_type type)
send_nmi_single(cpu, type);
}
+/*
+ * Boot-time handler for the "unknown_nmi_panic" command-line option.
+ * The mere presence of the option turns the flag on; @str is not inspected.
+ * Always returns 1.
+ */
+static int __init setup_unknown_nmi_panic(char *str)
+{
+	unknown_nmi_panic = 1;
+
+	return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
+/*
+ * Sysctl entry exposing unknown_nmi_panic; registered under "kernel" by
+ * unknown_nmi_init().
+ *
+ * The backing variable is an int, so the entry is sized from the variable
+ * itself and uses an integer handler. proc_dobool() would access the
+ * object through a bool pointer, which does not match its declared type.
+ */
+static const struct ctl_table unknown_nmi_table[] = {
+	{
+		.procname	= "unknown_nmi_panic",
+		.data		= &unknown_nmi_panic,
+		.maxlen		= sizeof(unknown_nmi_panic),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+};
+
+/*
+ * SSE callback registered for SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI by
+ * unknown_nmi_init().
+ *
+ * Logs the event together with the current CPU number and, when
+ * unknown_nmi_panic is non-zero, hands @regs to nmi_panic(). If execution
+ * continues past that point, a second message is logged and 0 is returned.
+ * @evt and @arg are not used.
+ */
+static int unknown_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+
+	pr_emerg("NMI received for unknown on CPU %d.\n", cpu);
+	if (unknown_nmi_panic)
+		nmi_panic(regs, "NMI: Not continuing");
+	pr_emerg("Dazed and confused, but trying to continue\n");
+
+	return 0;
+}
+
static int local_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
{
enum local_nmi_type type;
@@ -69,6 +102,35 @@ static int local_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
return 0;
}
+/*
+ * Set up unknown-NMI delivery over SSE:
+ *   1. register unknown_nmi_handler() for SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI,
+ *   2. enable the event,
+ *   3. register the unknown_nmi_panic sysctl under "kernel".
+ *
+ * The only caller is sse_nmi_init(), which is __init, so this function is
+ * marked __init as well (matching local_nmi_init()).
+ *
+ * Returns 0 on success. On failure, returns the error from the failing
+ * step (-ENOMEM if the sysctl could not be registered) after undoing the
+ * steps that had already succeeded.
+ */
+static int __init unknown_nmi_init(void)
+{
+	int ret;
+
+	unknown_nmi_evt = sse_event_register(SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI, 0,
+					     unknown_nmi_handler, NULL);
+	if (IS_ERR(unknown_nmi_evt))
+		return PTR_ERR(unknown_nmi_evt);
+
+	ret = sse_event_enable(unknown_nmi_evt);
+	if (ret)
+		goto err_unregister;
+
+	unknown_nmi_sysctl_header = register_sysctl("kernel", unknown_nmi_table);
+	if (!unknown_nmi_sysctl_header) {
+		ret = -ENOMEM;
+		goto err_disable;
+	}
+
+	pr_info("Using SSE for unknown NMI event delivery\n");
+	return 0;
+
+	/* Unwind in reverse order of setup. */
+err_disable:
+	sse_event_disable(unknown_nmi_evt);
+err_unregister:
+	sse_event_unregister(unknown_nmi_evt);
+	return ret;
+}
+
static int __init local_nmi_init(void)
{
int ret;
@@ -101,6 +163,12 @@ static int __init sse_nmi_init(void)
WRITE_ONCE(nmi_available, true);
+ ret = unknown_nmi_init();
+ if (ret) {
+ pr_err("Unknown_nmi_init failed with error %d\n", ret);
+ return ret;
+ }
+
return 0;
}
--
2.39.5
2025-11-27T20:53:05+08:00, Yunhui Cui <cuiyunhui@bytedance.com>: > Register unknown_nmi_handler() as the handler for the UNKNOWN_NMI > event. When the system becomes unresponsive, unknown_nmi_handler() > can be manually triggered, which in turn invokes nmi_panic() to > collect vmcore for root cause analysis. Is UNKNOWN_NMI what we expect the watchdog to send? Thanks.
Hi Radim, On Thu, Dec 4, 2025 at 12:11 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote: > > 2025-11-27T20:53:05+08:00, Yunhui Cui <cuiyunhui@bytedance.com>: > > Register unknown_nmi_handler() as the handler for the UNKNOWN_NMI > > event. When the system becomes unresponsive, unknown_nmi_handler() > > can be manually triggered, which in turn invokes nmi_panic() to > > collect vmcore for root cause analysis. > > Is UNKNOWN_NMI what we expect the watchdog to send? For reference: As stated in https://github.com/riscv-non-isa/riscv-sbi-doc/pull/223, "Generally, an external interrupt is used as an Unknown NMI pin, and an Unknown NMI event is sent to the SBI firmware by triggering this pin. Then the SBI firmware will send SBI_SSE_EVENT_GLOBAL_UNKNOWN_NMI to the kernel." When the Linux system is unresponsive, we can manually trigger it via BMC (ipmitool). > > Thanks. Thanks, Yunhui
2025-12-04T13:18:00+08:00, yunhui cui <cuiyunhui@bytedance.com>: > Hi Radim, > > On Thu, Dec 4, 2025 at 12:11 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote: >> >> 2025-11-27T20:53:05+08:00, Yunhui Cui <cuiyunhui@bytedance.com>: >> > Register unknown_nmi_handler() as the handler for the UNKNOWN_NMI >> > event. When the system becomes unresponsive, unknown_nmi_handler() >> > can be manually triggered, which in turn invokes nmi_panic() to >> > collect vmcore for root cause analysis. >> >> Is UNKNOWN_NMI what we expect the watchdog to send? > > For reference: As stated in > https://github.com/riscv-non-isa/riscv-sbi-doc/pull/223, "Generally, > an external interrupt is used as an Unknown NMI pin, and an Unknown > NMI event is sent to the SBI firmware by triggering this pin. Then the > SBI firmware will send SBI_SSE_EVENT_GLOBAL_UNKNOWN_NMI to the > kernel." > > When the Linux system is unresponsive, we can manually trigger it via > BMC (ipmitool). Makes sense, thanks, and do we plan to deal with other crash sources? For example if a watchdog/sysrq triggers a crash, and then gets interrupted with UNKNOWN_NMI.
© 2016 - 2026 Red Hat, Inc.