[PATCH v2 04/11] stop_machine: Add NMI-based execution path

Chang S. Bae posted 11 patches 1 day, 23 hours ago
[PATCH v2 04/11] stop_machine: Add NMI-based execution path
Posted by Chang S. Bae 1 day, 23 hours ago
Currently multi_cpu_stop() executes the stop function in the
MULTI_STOP_RUN state. But NMIs can still preempt the execution. For use
cases requiring stricter isolation from them, provide an option to run
the stop function from NMI context.

The NMI stop function must be built without instrumentation, because
instrumentation can raise exceptions whose IRET return path re-enables
NMIs. Annotate the NMI handler entirely with noinstr.

However, objtool cannot reliably determine whether the indirect call
target is located in a noinstr section, so it may emit false positives.
To avoid this, temporarily lift the instrumentation restriction around
the indirect call site and document the intention.

The x86 microcode loader is currently the primary user of this
facility, and other architectures are not expected to use it. Add a
build option to make it opt-in.

Originally-by: David Kaplan <david.kaplan@amd.com>
Suggested-by: Borislav Petkov <bp@alien8.de>
Suggested-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Link: https://lore.kernel.org/lkml/20260129121729.GRaXtP2aeWkQKegxC2@fat_crate.local
Link: https://lore.kernel.org/lkml/87wm0zl8p2.ffs@tglx
---
V1 -> V2: Multiple changes
* Fix racy return value read [**]
* Add Kconfig option (Thomas)
* Add cpumask to track NMI delivery (Boris)
* Rework implementation. Consolidate under ifdef

[**]: Observed delay in IPI/NMI delivery when testing error return paths
---
 arch/Kconfig                 |  3 ++
 include/linux/stop_machine.h | 16 ++++++
 kernel/stop_machine.c        | 96 +++++++++++++++++++++++++++++++++++-
 3 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 102ddbd4298e..f84fd528aae7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1841,4 +1841,7 @@ config ARCH_WANTS_PRE_LINK_VMLINUX
 config ARCH_HAS_CPU_ATTACK_VECTORS
 	bool
 
+config STOP_MACHINE_NMI
+	bool
+
 endmenu
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 2f986555113a..9424d363ab38 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -19,6 +19,12 @@
  */
 typedef int (*cpu_stop_fn_t)(void *arg);
 
+/*
+ * Stop function variant runnable from NMI context. This makes the
+ * noinstr requirement explicit at the type level.
+ */
+typedef int (*cpu_stop_nmisafe_fn_t)(void *arg);
+
 #ifdef CONFIG_SMP
 
 struct cpu_stop_work {
@@ -189,4 +195,14 @@ stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 }
 
 #endif	/* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_STOP_MACHINE_NMI
+
+void arch_send_self_nmi(void);
+bool noinstr stop_machine_nmi_handler(void);
+
+#else
+static inline bool stop_machine_nmi_handler(void) { return false; }
+#endif /* CONFIG_STOP_MACHINE_NMI */
+
 #endif	/* _LINUX_STOP_MACHINE */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 092c65c002ff..45ea62f1b2b5 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -22,6 +22,7 @@
 #include <linux/atomic.h>
 #include <linux/nmi.h>
 #include <linux/sched/wake_q.h>
+#include <linux/delay.h>
 
 /*
  * Structure to determine completion condition and record errors.  May
@@ -174,6 +175,15 @@ struct multi_stop_data {
 
 	enum multi_stop_state	state;
 	atomic_t		thread_ack;
+
+#ifdef CONFIG_STOP_MACHINE_NMI
+	/* Used in the NMI stop_machine variant */
+	bool			use_nmi;
+	/* A separate function type to highlight noinstr requirement */
+	cpu_stop_nmisafe_fn_t	nmisafe_fn;
+	/* cpumask of CPUs on which to raise an NMI */
+	cpumask_var_t		nmi_cpus;
+#endif
 };
 
 static void set_state(struct multi_stop_data *msdata,
@@ -197,6 +207,8 @@ notrace void __weak stop_machine_yield(const struct cpumask *cpumask)
 	cpu_relax();
 }
 
+static int multi_stop_run(struct multi_stop_data *msdata);
+
 /* This is the cpu_stop function which stops the CPU. */
 static int multi_cpu_stop(void *data)
 {
@@ -235,7 +247,7 @@ static int multi_cpu_stop(void *data)
 				break;
 			case MULTI_STOP_RUN:
 				if (is_active)
-					err = msdata->fn(msdata->data);
+					err = multi_stop_run(msdata);
 				break;
 			default:
 				break;
@@ -712,3 +724,85 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 	mutex_unlock(&stop_cpus_mutex);
 	return ret | done.ret;
 }
+
+#ifdef CONFIG_STOP_MACHINE_NMI
+
+struct nmi_stop {
+	struct multi_stop_data	*data;
+	int			ret;
+	bool			done;
+};
+
+static DEFINE_PER_CPU(struct nmi_stop, nmi_stop);
+
+/*
+ * Instrumentation may trigger nested exceptions such as #INT3, #DB,
+ * or #PF. IRET from those would re-enable NMIs, which opposes the goal
+ * of this NMI stop-machine facility.
+ */
+bool noinstr stop_machine_nmi_handler(void)
+{
+	struct multi_stop_data *msdata = raw_cpu_read(nmi_stop.data);
+	unsigned int cpu = smp_processor_id();
+	int ret;
+
+	if (!msdata || !cpumask_test_and_clear_cpu(cpu, msdata->nmi_cpus))
+		return false;
+
+	/*
+	 * Objtool cannot verify the target of the indirect call to
+	 * @nmisafe_fn() after compilation. Temporarily enabling
+	 * instrumentation avoids objtool false positives.
+	 */
+	instrumentation_begin();
+	ret = msdata->nmisafe_fn(msdata->data);
+	instrumentation_end();
+
+	raw_cpu_write(nmi_stop.ret,  ret);
+	raw_cpu_write(nmi_stop.done, true);
+	raw_cpu_write(nmi_stop.data, NULL);
+
+	return true;
+}
+
+static bool wait_for_nmi_handler(void)
+{
+	/* Conservative timeout */
+	unsigned long timeout = USEC_PER_SEC;
+
+	while (!this_cpu_read(nmi_stop.done) && timeout--)
+		udelay(1);
+
+	return this_cpu_read(nmi_stop.done);
+}
+
+static int nmi_stop_run(struct multi_stop_data *msdata)
+{
+	/*
+	 * Save per-CPU state accessible from NMI context and raise a
+	 * self-NMI to execute the stop function from the NMI handler
+	 */
+	this_cpu_write(nmi_stop.data, msdata);
+	this_cpu_write(nmi_stop.done, false);
+	arch_send_self_nmi();
+
+	/* Ensure the handler went through before reading the result */
+	if (!wait_for_nmi_handler())
+		return -ETIMEDOUT;
+
+	return this_cpu_read(nmi_stop.ret);
+}
+
+static int multi_stop_run(struct multi_stop_data *msdata)
+{
+	return msdata->use_nmi ? nmi_stop_run(msdata) : msdata->fn(msdata->data);
+}
+
+#else
+
+static int multi_stop_run(struct multi_stop_data *msdata)
+{
+	return msdata->fn(msdata->data);
+}
+
+#endif /* CONFIG_STOP_MACHINE_NMI */
-- 
2.51.0
Re: [PATCH v2 04/11] stop_machine: Add NMI-based execution path
Posted by Chang S. Bae 22 hours ago
Like others, I also checked the review bot:

  https://sashiko.dev/#/patchset/20260331014251.86353-1-chang.seok.bae@intel.com

I think all of the comments can be addressed in this patch. My take is
below.

On 3/30/2026 6:42 PM, Chang S. Bae wrote:
> 
> +struct nmi_stop {
> +	struct multi_stop_data	*data;
> +	int			ret;
> +	bool			done;

The intention was to make the completion condition explicit in the 
waiting loop. But a ->data == NULL check can substitute for it, and 
using a single variable makes the logic simpler and more robust.

> +bool noinstr stop_machine_nmi_handler(void)
> +{
> +	struct multi_stop_data *msdata = raw_cpu_read(nmi_stop.data);
> +	unsigned int cpu = smp_processor_id();
> +	int ret;
> +
> +	if (!msdata || !cpumask_test_and_clear_cpu(cpu, msdata->nmi_cpus))
> +		return false;

smp_processor_id() and cpumask_test_and_clear_cpu() are wrappers that 
may include instrumentation, so they are not suitable here. Instead, 
use the non-instrumented inner functions raw_smp_processor_id() and 
arch_test_and_clear_bit().


> +	/* Ensure the handler went through before reading the result */
> +	if (!wait_for_nmi_handler())
> +		return -ETIMEDOUT;

On the error-exit path, the per-CPU stop_data pointer should also be 
cleared, since the data it points to is freed later.


Attached is the diff addressing them.diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 45ea62f1b2b5..5bd1105d1a11 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -730,7 +730,6 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 struct nmi_stop {
 	struct multi_stop_data	*data;
 	int			ret;
-	bool			done;
 };
 
 static DEFINE_PER_CPU(struct nmi_stop, nmi_stop);
@@ -743,10 +742,10 @@ static DEFINE_PER_CPU(struct nmi_stop, nmi_stop);
 bool noinstr stop_machine_nmi_handler(void)
 {
 	struct multi_stop_data *msdata = raw_cpu_read(nmi_stop.data);
-	unsigned int cpu = smp_processor_id();
+	unsigned int cpu = raw_smp_processor_id();
 	int ret;
 
-	if (!msdata || !cpumask_test_and_clear_cpu(cpu, msdata->nmi_cpus))
+	if (!msdata || !arch_test_and_clear_bit(cpu, cpumask_bits(msdata->nmi_cpus)))
 		return false;
 
 	/*
@@ -759,7 +758,6 @@ bool noinstr stop_machine_nmi_handler(void)
 	instrumentation_end();
 
 	raw_cpu_write(nmi_stop.ret,  ret);
-	raw_cpu_write(nmi_stop.done, true);
 	raw_cpu_write(nmi_stop.data, NULL);
 
 	return true;
@@ -770,10 +768,11 @@ static bool wait_for_nmi_handler(void)
 	/* Conservative timeout */
 	unsigned long timeout = USEC_PER_SEC;
 
-	while (!this_cpu_read(nmi_stop.done) && timeout--)
+	/* The handler clears up at the end */
+	while (this_cpu_read(nmi_stop.data) && timeout--)
 		udelay(1);
 
-	return this_cpu_read(nmi_stop.done);
+	return !this_cpu_read(nmi_stop.data);
 }
 
 static int nmi_stop_run(struct multi_stop_data *msdata)
@@ -783,12 +782,16 @@ static int nmi_stop_run(struct multi_stop_data *msdata)
 	 * self-NMI to execute the stop function from the NMI handler
 	 */
 	this_cpu_write(nmi_stop.data, msdata);
-	this_cpu_write(nmi_stop.done, false);
 	arch_send_self_nmi();
 
-	/* Ensure the handler went through before reading the result */
-	if (!wait_for_nmi_handler())
+	/*
+	 * Ensure the handler went through before reading the result.
+	 * Otherwise, leave no stale state behind.
+	 */
+	if (!wait_for_nmi_handler()) {
+		this_cpu_write(nmi_stop.data, NULL);
 		return -ETIMEDOUT;
+	}
 
 	return this_cpu_read(nmi_stop.ret);
 }