[PATCH v2 4/8] riscv: smp: use NMI for crash stop

Use NMI instead of IPI for crash stop if RISC-V SSE NMI is supported.

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
 arch/riscv/kernel/smp.c                | 14 +++++++++++++-
 drivers/firmware/riscv/riscv_sse_nmi.c | 10 ++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c
index 669325e68a21a..07ccc28f52172 100644
--- a/arch/riscv/kernel/smp.c
+++ b/arch/riscv/kernel/smp.c
@@ -16,6 +16,7 @@
 #include <linux/kgdb.h>
 #include <linux/percpu.h>
 #include <linux/profile.h>
+#include <linux/riscv_sse_nmi.h>
 #include <linux/smp.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
@@ -300,7 +301,18 @@ void crash_smp_send_stop(void)
 	atomic_set(&waiting_for_crash_ipi, num_other_online_cpus());
 
 	pr_crit("SMP: stopping secondary CPUs\n");
-	send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
+
+	/*
+	 * An IPI is cheaper than an NMI, but trying the IPI first and
+	 * falling back to the NMI for CPUs that fail to stop would require
+	 * tracking those CPUs, which complicates cpu_crash_stop(). Crash
+	 * stop is rare and runs only in the final phase of a crash, so
+	 * simply use the NMI whenever it is available.
+	 */
+	if (!nmi_support())
+		send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
+	else
+		send_nmi_mask(&mask, LOCAL_NMI_CRASH);
 
 	/* Wait up to one second for other CPUs to stop */
 	timeout = USEC_PER_SEC;
diff --git a/drivers/firmware/riscv/riscv_sse_nmi.c b/drivers/firmware/riscv/riscv_sse_nmi.c
index 1763f43961ab6..e4c20dce40f9a 100644
--- a/drivers/firmware/riscv/riscv_sse_nmi.c
+++ b/drivers/firmware/riscv/riscv_sse_nmi.c
@@ -10,6 +10,9 @@
 #include <asm/sbi.h>
 #include <asm/smp.h>
 
+#define NMI_HANDLE(mask, func, ...) \
+	do { if (type & (mask)) func(__VA_ARGS__); } while (0)
+
 bool nmi_available;
 static struct sse_event *local_nmi_evt;
 static atomic_t local_nmi_arg = ATOMIC_INIT(LOCAL_NMI_NONE);
@@ -48,6 +51,13 @@ void send_nmi_mask(cpumask_t *mask, enum local_nmi_type type)
 
 static int local_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
 {
+	enum local_nmi_type type = atomic_read((atomic_t *)arg);
+	unsigned int cpu = smp_processor_id();
+
+	NMI_HANDLE(LOCAL_NMI_CRASH, cpu_crash_stop, cpu, regs);
+
+	atomic_set(&local_nmi_arg, LOCAL_NMI_NONE);
+
 	return 0;
 }
 
-- 
2.39.5
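
For context: the patch relies on nmi_support(), send_nmi_mask(), LOCAL_NMI_CRASH and nmi_available from <linux/riscv_sse_nmi.h>, which is introduced by an earlier patch in this series and is not shown in these hunks. A rough sketch of that interface as this patch appears to assume it (the bit encoding of the enum and the behaviour of nmi_support() are guesses, not taken from the posted header):

/* Hypothetical sketch of <linux/riscv_sse_nmi.h>; names match the hunks
 * above, the values and comments are illustrative only. */
#include <linux/bits.h>
#include <linux/cpumask.h>
#include <linux/types.h>

enum local_nmi_type {
	LOCAL_NMI_NONE	= 0,
	LOCAL_NMI_CRASH	= BIT(0),	/* run cpu_crash_stop() on the target CPU */
	/* later patches in the series likely add further reasons */
};

/* Set by the SSE NMI driver once the local NMI event is registered. */
extern bool nmi_available;

/* Presumably just reports nmi_available. */
bool nmi_support(void);

/* Record @type for the targets and inject the SSE-based NMI on @mask. */
void send_nmi_mask(cpumask_t *mask, enum local_nmi_type type);
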
Re: [PATCH v2 4/8] riscv: smp: use NMI for crash stop
Hi All,

On Tue, Nov 18, 2025 at 10:51 AM Yunhui Cui <cuiyunhui@bytedance.com> wrote:
>
> [...]
> @@ -48,6 +51,13 @@ void send_nmi_mask(cpumask_t *mask, enum local_nmi_type type)
>
>  static int local_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
>  {
> +       enum local_nmi_type type = atomic_read((atomic_t *)arg);
> +       unsigned int cpu = smp_processor_id();
> +
> +       NMI_HANDLE(LOCAL_NMI_CRASH, cpu_crash_stop, cpu, regs);
> +
> +       atomic_set(&local_nmi_arg, LOCAL_NMI_NONE);

Don't clear local_nmi_arg to zero outright; clear only the bits that were
actually processed. Otherwise the handler may drop NMI types that were
raised but not yet handled.
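
Something like the following (only a sketch, assuming the local_nmi_type
values are individual bits and using the kernel's atomic_andnot() helper)
would keep any bits that were raised but not yet handled:

static int local_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
{
	atomic_t *pending = arg;
	unsigned int type = atomic_read(pending);
	unsigned int handled = 0;
	unsigned int cpu = smp_processor_id();

	if (type & LOCAL_NMI_CRASH) {
		cpu_crash_stop(cpu, regs);
		handled |= LOCAL_NMI_CRASH;
	}

	/* Clear only the bits this handler actually processed. */
	atomic_andnot(handled, pending);

	return 0;
}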

> +
>         return 0;
>  }
>

Thanks,
Yunhui