[v3] Add NMI Support to RISC-V via SSE

[PATCH v3 8/8] drivers: firmware: riscv: add unknown nmi support

Posted by Yunhui Cui 2 months, 2 weeks ago

Register unknown_nmi_handler() as the handler for the UNKNOWN_NMI
event. When the system becomes unresponsive, unknown_nmi_handler()
can be manually triggered, which in turn invokes nmi_panic() to
collect vmcore for root cause analysis.

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
 arch/riscv/include/asm/sbi.h           |  1 +
 drivers/firmware/riscv/riscv_sse_nmi.c | 68 ++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 874cc1d7603a5..52d3fdf2d4cc1 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -486,6 +486,7 @@ enum sbi_sse_attr_id {
 #define SBI_SSE_EVENT_LOCAL_LOW_PRIO_RAS	0x00100000
 #define SBI_SSE_EVENT_GLOBAL_LOW_PRIO_RAS	0x00108000
 #define SBI_SSE_EVENT_LOCAL_SOFTWARE_INJECTED	0xffff0000
+#define SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI		0xffff0001
 #define SBI_SSE_EVENT_GLOBAL_SOFTWARE_INJECTED	0xffff8000
 
 #define SBI_SSE_EVENT_PLATFORM		BIT(14)
diff --git a/drivers/firmware/riscv/riscv_sse_nmi.c b/drivers/firmware/riscv/riscv_sse_nmi.c
index 85aa65f31943b..d98015d1cb893 100644
--- a/drivers/firmware/riscv/riscv_sse_nmi.c
+++ b/drivers/firmware/riscv/riscv_sse_nmi.c
@@ -7,6 +7,7 @@
 #include <linux/nmi.h>
 #include <linux/riscv_sbi_sse.h>
 #include <linux/riscv_sse_nmi.h>
+#include <linux/sysctl.h>
 
 #include <asm/irq_regs.h>
 #include <asm/sbi.h>
@@ -16,7 +17,10 @@
 	do { if (type & (mask)) func(__VA_ARGS__); } while (0)
 
 static bool nmi_available;
+static int unknown_nmi_panic;
 static struct sse_event *local_nmi_evt;
+static struct sse_event *unknown_nmi_evt;
+static struct ctl_table_header *unknown_nmi_sysctl_header;
 static DEFINE_PER_CPU(atomic_t, local_nmi) = ATOMIC_INIT(LOCAL_NMI_NONE);
 
 bool nmi_support(void)
@@ -52,6 +56,35 @@ void send_nmi_mask(cpumask_t *mask, enum local_nmi_type type)
 		send_nmi_single(cpu, type);
 }
 
+static int __init setup_unknown_nmi_panic(char *str)	/* hook for the "unknown_nmi_panic" boot parameter; str is not used */
+{
+	unknown_nmi_panic = 1;	/* makes unknown_nmi_handler() call nmi_panic() */
+	return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
+static const struct ctl_table unknown_nmi_table[] = {	/* sysctl entry registered under "kernel" by unknown_nmi_init() */
+	{
+		.procname       = "unknown_nmi_panic",
+		.data           = &unknown_nmi_panic,	/* declared as int, so size and handler below must be the int variants */
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+};
+
+static int unknown_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)	/* SSE callback registered for SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI; evt and arg are not used */
+{
+	pr_emerg("NMI received for unknown on CPU %d.\n", smp_processor_id());
+
+	if (unknown_nmi_panic)	/* set by the boot parameter or through the sysctl */
+		nmi_panic(regs, "NMI: Not continuing");
+
+	pr_emerg("Dazed and confused, but trying to continue\n");	/* reached whenever execution continues past the check above */
+
+	return 0;
+}
+
 static int local_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
 {
 	enum local_nmi_type type;
@@ -69,6 +102,35 @@ static int local_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
 	return 0;
 }
 
+static int __init unknown_nmi_init(void)	/* called from sse_nmi_init(); returns 0 or a negative errno */
+{
+	int ret;
+
+	unknown_nmi_evt = sse_event_register(SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI, 0,
+					     unknown_nmi_handler, NULL);
+	if (IS_ERR(unknown_nmi_evt))
+		return PTR_ERR(unknown_nmi_evt);
+
+	ret = sse_event_enable(unknown_nmi_evt);
+	if (ret)
+		goto err_unregister;
+
+	unknown_nmi_sysctl_header = register_sysctl("kernel", unknown_nmi_table);
+	if (!unknown_nmi_sysctl_header) {
+		ret = -ENOMEM;
+		goto err_disable;
+	}
+
+	pr_info("Using SSE for unknown NMI event delivery\n");
+	return 0;
+
+err_disable:	/* unwind in reverse order of the setup steps above */
+	sse_event_disable(unknown_nmi_evt);
+err_unregister:
+	sse_event_unregister(unknown_nmi_evt);
+	return ret;
+}
+
 static int __init local_nmi_init(void)
 {
 	int ret;
@@ -101,6 +163,12 @@ static int __init sse_nmi_init(void)
 
 	WRITE_ONCE(nmi_available, true);
 
+	ret = unknown_nmi_init();
+	if (ret) {
+		pr_err("Unknown_nmi_init failed with error %d\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.39.5

Re: [PATCH v3 8/8] drivers: firmware: riscv: add unknown nmi support

Posted by Radim Krčmář 2 months, 1 week ago

2025-11-27T20:53:05+08:00, Yunhui Cui <cuiyunhui@bytedance.com>:
> Register unknown_nmi_handler() as the handler for the UNKNOWN_NMI
> event. When the system becomes unresponsive, unknown_nmi_handler()
> can be manually triggered, which in turn invokes nmi_panic() to
> collect vmcore for root cause analysis.

Is UNKNOWN_NMI what we expect the watchdog to send?

Thanks.

Re: [External] Re: [PATCH v3 8/8] drivers: firmware: riscv: add unknown nmi support

Posted by yunhui cui 2 months, 1 week ago

Hi Radim,

On Thu, Dec 4, 2025 at 12:11 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>
> 2025-11-27T20:53:05+08:00, Yunhui Cui <cuiyunhui@bytedance.com>:
> > Register unknown_nmi_handler() as the handler for the UNKNOWN_NMI
> > event. When the system becomes unresponsive, unknown_nmi_handler()
> > can be manually triggered, which in turn invokes nmi_panic() to
> > collect vmcore for root cause analysis.
>
> Is UNKNOWN_NMI what we expect the watchdog to send?

For reference: As stated in
https://github.com/riscv-non-isa/riscv-sbi-doc/pull/223, "Generally,
an external interrupt is used as an Unknown NMI pin, and an Unknown
NMI event is sent to the SBI firmware by triggering this pin. Then the
SBI firmware will send SBI_SSE_EVENT_GLOBAL_UNKNOWN_NMI to the
kernel."

When the Linux system is unresponsive, we can manually trigger it via
BMC (ipmitool).

>
> Thanks.

Thanks,
Yunhui

Re: [External] Re: [PATCH v3 8/8] drivers: firmware: riscv: add unknown nmi support

Posted by Radim Krčmář 2 months, 1 week ago

2025-12-04T13:18:00+08:00, yunhui cui <cuiyunhui@bytedance.com>:
> Hi Radim,
>
> On Thu, Dec 4, 2025 at 12:11 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>>
>> 2025-11-27T20:53:05+08:00, Yunhui Cui <cuiyunhui@bytedance.com>:
>> > Register unknown_nmi_handler() as the handler for the UNKNOWN_NMI
>> > event. When the system becomes unresponsive, unknown_nmi_handler()
>> > can be manually triggered, which in turn invokes nmi_panic() to
>> > collect vmcore for root cause analysis.
>>
>> Is UNKNOWN_NMI what we expect the watchdog to send?
>
> For reference: As stated in
> https://github.com/riscv-non-isa/riscv-sbi-doc/pull/223, "Generally,
> an external interrupt is used as an Unknown NMI pin, and an Unknown
> NMI event is sent to the SBI firmware by triggering this pin. Then the
> SBI firmware will send SBI_SSE_EVENT_GLOBAL_UNKNOWN_NMI to the
> kernel."
>
> When the Linux system is unresponsive, we can manually trigger it via
> BMC (ipmitool).

Makes sense, thanks, and do we plan to deal with other crash sources?

For example if a watchdog/sysrq triggers a crash, and then gets
interrupted with UNKNOWN_NMI.

[PATCH v3 1/8] drivers: firmware: riscv: add SSE NMI support
[PATCH v3 2/8] riscv: smp: move ipi_cpu_crash_stop() declaration to smp.h
[PATCH v3 3/8] smp: move num_other_online_cpus() into smp.h
[PATCH v3 4/8] riscv: smp: use NMI for crash stop
[PATCH v3 5/8] riscv: smp: use NMI for CPU stop
[PATCH v3 6/8] riscv: smp: use NMI for backtrace
[PATCH v3 7/8] riscv: smp: kgdb: use NMI for CPU roundup
[PATCH v3 8/8] drivers: firmware: riscv: add unknown nmi support