[PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter

Lyude Paul posted 16 patches 2 weeks, 2 days ago
Posted by Lyude Paul 2 weeks, 2 days ago
From: Joel Fernandes <joelagnelf@nvidia.com>

Move NMI nesting tracking from the preempt_count bits to a separate per-CPU
counter (nmi_nesting). This frees up the NMI bits in the preempt_count so
they can be repurposed for other uses. It also allows tracking nesting more
than 16 levels deep, should there ever be a need.

Reduce the number of preempt_count bits used for NMI tracking: NMI_BITS
shrinks from 3 to 1, and the remaining bit is used only to detect whether
we are in an NMI.
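
For illustration, a minimal user-space model of the resulting scheme (not
part of the patch; the shift values assume the mask layout documented in
the preempt.h hunk below, and the flag is cleared directly rather than by
subtracting NMI_OFFSET, which is equivalent with NMI_BITS == 1):

#include <assert.h>
#include <stdio.h>

#define HARDIRQ_OFFSET	(1u << 24)	/* bits 24-27: hardirq count */
#define NMI_MASK	(1u << 28)	/* bit 28: in-NMI flag */

/* stand-ins for the per-CPU preempt_count and nmi_nesting */
static unsigned int preempt_count;
static unsigned int nmi_nesting;

static void model_nmi_enter(void)
{
	nmi_nesting++;				/* nesting depth lives here now */
	preempt_count += HARDIRQ_OFFSET;
	preempt_count |= NMI_MASK;		/* single in-NMI flag */
}

static void model_nmi_exit(void)
{
	preempt_count -= HARDIRQ_OFFSET;
	if (!--nmi_nesting)
		preempt_count &= ~NMI_MASK;	/* last exit clears the flag */
}

int main(void)
{
	model_nmi_enter();			/* outer NMI */
	model_nmi_enter();			/* nested NMI */
	model_nmi_exit();
	assert(preempt_count & NMI_MASK);	/* still inside the outer NMI */
	model_nmi_exit();
	assert(!(preempt_count & NMI_MASK));	/* fully exited */
	printf("depth tracked in nmi_nesting, preempt_count keeps one flag\n");
	return 0;
}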

Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Lyude Paul <lyude@redhat.com>
---
 include/linux/hardirq.h | 16 ++++++++++++----
 include/linux/preempt.h | 13 +++++++++----
 kernel/softirq.c        |  2 ++
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d57cab4d4c06f..cc06bda52c3e5 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -10,6 +10,8 @@
 #include <linux/vtime.h>
 #include <asm/hardirq.h>
 
+DECLARE_PER_CPU(unsigned int, nmi_nesting);
+
 extern void synchronize_irq(unsigned int irq);
 extern bool synchronize_hardirq(unsigned int irq);
 
@@ -102,14 +104,16 @@ void irq_exit_rcu(void);
  */
 
 /*
- * nmi_enter() can nest up to 15 times; see NMI_BITS.
+ * nmi_enter() can nest - nesting is tracked in a per-CPU counter.
  */
 #define __nmi_enter()						\
 	do {							\
 		lockdep_off();					\
 		arch_nmi_enter();				\
-		BUG_ON(in_nmi() == NMI_MASK);			\
-		__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
+		BUG_ON(__this_cpu_read(nmi_nesting) == UINT_MAX);	\
+		__this_cpu_inc(nmi_nesting);			\
+		__preempt_count_add(HARDIRQ_OFFSET);		\
+		preempt_count_set(preempt_count() | NMI_MASK);	\
 	} while (0)
 
 #define nmi_enter()						\
@@ -124,8 +128,12 @@ void irq_exit_rcu(void);
 
 #define __nmi_exit()						\
 	do {							\
+		unsigned int nesting;				\
 		BUG_ON(!in_nmi());				\
-		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
+		__preempt_count_sub(HARDIRQ_OFFSET);		\
+		nesting = __this_cpu_dec_return(nmi_nesting);	\
+		if (!nesting)					\
+			__preempt_count_sub(NMI_OFFSET);	\
 		arch_nmi_exit();				\
 		lockdep_on();					\
 	} while (0)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index f07e7f37f3ca5..e2d3079d3f5f1 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -18,6 +18,8 @@
  * - bits 0-7 are the preemption count (max preemption depth: 256)
  * - bits 8-15 are the softirq count (max # of softirqs: 256)
  * - bits 16-23 are the hardirq disable count (max # of hardirq disable: 256)
+ * - bits 24-27 are the hardirq count (max # of hardirqs: 16)
+ * - bit 28 is the NMI flag (no nesting count, tracked separately)
  *
  * The hardirq count could in theory be the same as the number of
  * interrupts in the system, but we run all interrupt handlers with
@@ -25,18 +27,21 @@
  * there are a few palaeontologic drivers which reenable interrupts in
  * the handler, so we need more than one bit here.
  *
+ * NMI nesting depth is tracked in a separate per-CPU variable
+ * (nmi_nesting) to save bits in preempt_count.
+ *
  *         PREEMPT_MASK:	0x000000ff
  *         SOFTIRQ_MASK:	0x0000ff00
  * HARDIRQ_DISABLE_MASK:	0x00ff0000
- *         HARDIRQ_MASK:	0x07000000
- *             NMI_MASK:	0x38000000
+ *         HARDIRQ_MASK:	0x0f000000
+ *             NMI_MASK:	0x10000000
  * PREEMPT_NEED_RESCHED:	0x80000000
  */
 #define PREEMPT_BITS	8
 #define SOFTIRQ_BITS	8
 #define HARDIRQ_DISABLE_BITS	8
-#define HARDIRQ_BITS	3
-#define NMI_BITS	3
+#define HARDIRQ_BITS	4
+#define NMI_BITS	1
 
 #define PREEMPT_SHIFT	0
 #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 77198911b8dd4..af47ea23aba3b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -88,6 +88,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
 EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
 #endif
 
+DEFINE_PER_CPU(unsigned int, nmi_nesting);
+
 /*
  * SOFTIRQ_OFFSET usage:
  *
-- 
2.52.0
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Peter Zijlstra 3 days, 18 hours ago
On Wed, Jan 21, 2026 at 05:39:05PM -0500, Lyude Paul wrote:

>  #define __nmi_enter()						\
>  	do {							\
>  		lockdep_off();					\
>  		arch_nmi_enter();				\
> -		BUG_ON(in_nmi() == NMI_MASK);			\
> -		__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
> +		BUG_ON(__this_cpu_read(nmi_nesting) == UINT_MAX);	\
> +		__this_cpu_inc(nmi_nesting);			\
> +		__preempt_count_add(HARDIRQ_OFFSET);		\
> +		preempt_count_set(preempt_count() | NMI_MASK);	\
>  	} while (0)
>  
>  #define nmi_enter()						\
> @@ -124,8 +128,12 @@ void irq_exit_rcu(void);
>  
>  #define __nmi_exit()						\
>  	do {							\
> +		unsigned int nesting;				\
>  		BUG_ON(!in_nmi());				\
> -		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
> +		__preempt_count_sub(HARDIRQ_OFFSET);		\
> +		nesting = __this_cpu_dec_return(nmi_nesting);	\
> +		if (!nesting)					\
> +			__preempt_count_sub(NMI_OFFSET);	\
>  		arch_nmi_exit();				\
>  		lockdep_on();					\
>  	} while (0)

While not wrong like last time, it is pretty awful.

preempt_count_set() is a cmpxchg() loop.

Would not something like so be better?

#define __nmi_enter()                                           \
        do {                                                    \
+               unsigned int _o = NMI_MASK + HARDIRQ_OFFSET;    \
                lockdep_off();                                  \
                arch_nmi_enter();                               \
-               BUG_ON(in_nmi() == NMI_MASK);                   \
-               __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);       \
+               BUG_ON(__this_cpu_read(nmi_nesting) == ~0U);    \
+               __this_cpu_inc(nmi_nesting);                    \
+               _o -= (preempt_count() & NMI_MASK);             \
+               __preempt_count_add(_o);                        \
        } while (0)


 #define __nmi_exit()                                           \
        do {                                                    \
+               unsigned int _o = HARDIRQ_OFFSET;               \
                BUG_ON(!in_nmi());                              \
-               __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);       \
+               if (!__this_cpu_dec_return(nmi_nesting))        \
+                       _o += NMI_MASK;                         \
+               __preempt_count_sub(_o);                        \
                arch_nmi_exit();                                \
                lockdep_on();                                   \
        } while (0)


But I'm really somewhat sad that 64bit can't do better than this.
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Peter Zijlstra 2 days, 19 hours ago
On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
> But I'm really somewhat sad that 64bit can't do better than this.

Here, the below builds and boots (albeit with warnings because printf
format crap sucks).

---
 arch/x86/Kconfig               |  1 +
 arch/x86/include/asm/preempt.h | 53 ++++++++++++++++++++++++++++++------------
 arch/x86/kernel/cpu/common.c   |  2 +-
 include/linux/hardirq.h        |  7 +++---
 include/linux/preempt.h        | 52 ++++++++++++++++++++++++++++++++++-------
 init/main.c                    |  2 +-
 kernel/Kconfig.preempt         |  4 ++++
 kernel/sched/core.c            |  8 +++----
 kernel/softirq.c               | 10 +++++++-
 kernel/time/timer.c            |  2 +-
 lib/locking-selftest.c         |  2 +-
 11 files changed, 106 insertions(+), 37 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 80527299f859..2bd1972fd4c7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -326,6 +326,7 @@ config X86
 	select USER_STACKTRACE_SUPPORT
 	select HAVE_ARCH_KCSAN			if X86_64
 	select PROC_PID_ARCH_STATUS		if PROC_FS
+	select PREEMPT_LONG			if X86_64
 	select HAVE_ARCH_NODE_DEV_GROUP		if X86_SGX
 	select FUNCTION_ALIGNMENT_16B		if X86_64 || X86_ALIGNMENT_16
 	select FUNCTION_ALIGNMENT_4B
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 578441db09f0..1b54d5555138 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -7,10 +7,19 @@
 
 #include <linux/static_call_types.h>
 
-DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);
+DECLARE_PER_CPU_CACHE_HOT(unsigned long, __preempt_count);
 
-/* We use the MSB mostly because its available */
-#define PREEMPT_NEED_RESCHED	0x80000000
+/*
+ * We use the MSB for PREEMPT_NEED_RESCHED mostly because it is available.
+ */
+
+#ifdef CONFIG_64BIT
+#define PREEMPT_NEED_RESCHED	(~((-1L) >> 1))
+#define __pc_op(op, ...)	raw_cpu_##op##_8(__VA_ARGS__)
+#else
+#define PREEMPT_NEED_RESCHED	(~((-1) >> 1))
+#define __pc_op(op, ...)	raw_cpu_##op##_4(__VA_ARGS__)
+#endif
 
 /*
  * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
@@ -24,18 +33,18 @@ DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);
  */
 static __always_inline int preempt_count(void)
 {
-	return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
+	return __pc_op(read, __preempt_count) & ~PREEMPT_NEED_RESCHED;
 }
 
-static __always_inline void preempt_count_set(int pc)
+static __always_inline void preempt_count_set(long pc)
 {
 	int old, new;
 
-	old = raw_cpu_read_4(__preempt_count);
+	old = __pc_op(read, __preempt_count);
 	do {
 		new = (old & PREEMPT_NEED_RESCHED) |
 			(pc & ~PREEMPT_NEED_RESCHED);
-	} while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new));
+	} while (!__pc_op(try_cmpxchg, __preempt_count, &old, new));
 }
 
 /*
@@ -58,33 +67,45 @@ static __always_inline void preempt_count_set(int pc)
 
 static __always_inline void set_preempt_need_resched(void)
 {
-	raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
+	__pc_op(and, __preempt_count, ~PREEMPT_NEED_RESCHED);
 }
 
 static __always_inline void clear_preempt_need_resched(void)
 {
-	raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
+	__pc_op(or, __preempt_count, PREEMPT_NEED_RESCHED);
 }
 
 static __always_inline bool test_preempt_need_resched(void)
 {
-	return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
+	return !(__pc_op(read, __preempt_count) & PREEMPT_NEED_RESCHED);
 }
 
 /*
  * The various preempt_count add/sub methods
  */
 
-static __always_inline void __preempt_count_add(int val)
+static __always_inline void __preempt_count_add(long val)
 {
-	raw_cpu_add_4(__preempt_count, val);
+	__pc_op(add, __preempt_count, val);
 }
 
-static __always_inline void __preempt_count_sub(int val)
+static __always_inline void __preempt_count_sub(long val)
 {
-	raw_cpu_add_4(__preempt_count, -val);
+	__pc_op(add, __preempt_count, -val);
 }
 
+#ifdef CONFIG_64BIT
+static __always_inline void __preempt_count_nmi_enter(void)
+{
+	__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);
+}
+
+static __always_inline void __preempt_count_nmi_exit(void)
+{
+	__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);
+}
+#endif
+
 /*
  * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
  * a decrement which hits zero means we have no preempt_count and should
@@ -101,7 +122,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
  */
 static __always_inline bool should_resched(int preempt_offset)
 {
-	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
+	return unlikely(__pc_op(read, __preempt_count) == preempt_offset);
 }
 
 #ifdef CONFIG_PREEMPTION
@@ -148,4 +169,6 @@ do { \
 
 #endif /* PREEMPTION */
 
+#undef __pc_op
+
 #endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e7ab22fce3b5..9d3602f085c9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2219,7 +2219,7 @@ DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 EXPORT_PER_CPU_SYMBOL(const_current_task);
 
-DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
 DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d57cab4d4c06..77defd9624bf 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -108,15 +108,14 @@ void irq_exit_rcu(void);
 	do {							\
 		lockdep_off();					\
 		arch_nmi_enter();				\
-		BUG_ON(in_nmi() == NMI_MASK);			\
-		__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
+		__preempt_count_nmi_enter();			\
 	} while (0)
 
 #define nmi_enter()						\
 	do {							\
 		__nmi_enter();					\
 		lockdep_hardirq_enter();			\
-		ct_nmi_enter();				\
+		ct_nmi_enter();					\
 		instrumentation_begin();			\
 		ftrace_nmi_enter();				\
 		instrumentation_end();				\
@@ -125,7 +124,7 @@ void irq_exit_rcu(void);
 #define __nmi_exit()						\
 	do {							\
 		BUG_ON(!in_nmi());				\
-		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
+		__preempt_count_nmi_exit();			\
 		arch_nmi_exit();				\
 		lockdep_on();					\
 	} while (0)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index d964f965c8ff..7617ca97f442 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -17,6 +17,9 @@
  *
  * - bits 0-7 are the preemption count (max preemption depth: 256)
  * - bits 8-15 are the softirq count (max # of softirqs: 256)
+ * - bits 16-23 are the hardirq disable count (max # of hardirq disable: 256)
+ * - bits 24-27 are the hardirq count (max # of hardirqs: 16)
+ * - bit 28 is the NMI flag (no nesting count, tracked separately)
  *
  * The hardirq count could in theory be the same as the number of
  * interrupts in the system, but we run all interrupt handlers with
@@ -24,31 +27,41 @@
  * there are a few palaeontologic drivers which reenable interrupts in
  * the handler, so we need more than one bit here.
  *
- *         PREEMPT_MASK:	0x000000ff
- *         SOFTIRQ_MASK:	0x0000ff00
- *         HARDIRQ_MASK:	0x000f0000
- *             NMI_MASK:	0x00f00000
- * PREEMPT_NEED_RESCHED:	0x80000000
+ * NMI nesting depth is tracked in a separate per-CPU variable
+ * (nmi_nesting) to save bits in preempt_count.
+ *
+ *				32bit		64bit + PREEMPT_LONG
+ *
+ *         PREEMPT_MASK:	0x000000ff	0x00000000000000ff
+ *         SOFTIRQ_MASK:	0x0000ff00	0x000000000000ff00
+ * HARDIRQ_DISABLE_MASK:	0x00ff0000	0x0000000000ff0000
+ *         HARDIRQ_MASK:	0x0f000000	0x000000000f000000
+ *             NMI_MASK:	0x10000000	0x00000000f0000000
+ * PREEMPT_NEED_RESCHED:	0x80000000	0x8000000000000000
  */
 #define PREEMPT_BITS	8
 #define SOFTIRQ_BITS	8
+#define HARDIRQ_DISABLE_BITS	8
 #define HARDIRQ_BITS	4
-#define NMI_BITS	4
+#define NMI_BITS	(1 + 3*IS_ENABLED(CONFIG_PREEMPT_LONG))
 
 #define PREEMPT_SHIFT	0
 #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
-#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define HARDIRQ_DISABLE_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define HARDIRQ_SHIFT	(HARDIRQ_DISABLE_SHIFT + HARDIRQ_DISABLE_BITS)
 #define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
 
 #define __IRQ_MASK(x)	((1UL << (x))-1)
 
 #define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
 #define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+#define HARDIRQ_DISABLE_MASK	(__IRQ_MASK(HARDIRQ_DISABLE_BITS) << HARDIRQ_DISABLE_SHIFT)
 #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
 #define NMI_MASK	(__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
 
 #define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
 #define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
+#define HARDIRQ_DISABLE_OFFSET	(1UL << HARDIRQ_DISABLE_SHIFT)
 #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
 #define NMI_OFFSET	(1UL << NMI_SHIFT)
 
@@ -105,8 +118,8 @@ static __always_inline unsigned char interrupt_context_level(void)
  * preempt_count() is commonly implemented with READ_ONCE().
  */
 
-#define nmi_count()	(preempt_count() & NMI_MASK)
-#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
+#define nmi_count()		(preempt_count() & NMI_MASK)
+#define hardirq_count()		(preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
 # define softirq_count()	(current->softirq_disable_cnt & SOFTIRQ_MASK)
 # define irq_count()		((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
@@ -132,6 +145,27 @@ static __always_inline unsigned char interrupt_context_level(void)
 # define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
 #endif
 
+#ifndef CONFIG_PREEMPT_LONG
+DECLARE_PER_CPU(unsigned int, nmi_nesting);
+
+#define __preempt_count_nmi_enter()				\
+	do {							\
+		unsigned int _o = NMI_MASK + HARDIRQ_OFFSET;	\
+		__this_cpu_inc(nmi_nesting);			\
+		_o -= (preempt_count() & NMI_MASK);		\
+		__preempt_count_add(_o);			\
+	} while (0)
+
+#define __preempt_count_nmi_exit()				\
+	do {							\
+		unsigned int _o = HARDIRQ_OFFSET;		\
+		if (!__this_cpu_dec_return(nmi_nesting))	\
+			_o += NMI_MASK;				\
+		__preempt_count_sub(_o);			\
+	} while (0)
+
+#endif
+
 /*
  * The following macros are deprecated and should not be used in new code:
  * in_softirq()   - We have BH disabled, or are processing softirqs
diff --git a/init/main.c b/init/main.c
index b84818ad9685..f8f4b78b7a06 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1367,7 +1367,7 @@ static inline void do_trace_initcall_level(const char *level)
 
 int __init_or_module do_one_initcall(initcall_t fn)
 {
-	int count = preempt_count();
+	long count = preempt_count();
 	char msgbuf[64];
 	int ret;
 
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 88c594c6d7fc..2ad9365915eb 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -122,6 +122,10 @@ config PREEMPT_RT_NEEDS_BH_LOCK
 config PREEMPT_COUNT
        bool
 
+config PREEMPT_LONG
+	bool
+	depends on PREEMPT_COUNT && 64BIT
+
 config PREEMPTION
        bool
        select PREEMPT_COUNT
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b411e4feff7f..f54dd3cb66f2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5709,7 +5709,7 @@ static inline void sched_tick_stop(int cpu) { }
  * If the value passed in is equal to the current preempt count
  * then we just disabled preemption. Start timing the latency.
  */
-static inline void preempt_latency_start(int val)
+static inline void preempt_latency_start(long val)
 {
 	if (preempt_count() == val) {
 		unsigned long ip = get_lock_parent_ip();
@@ -5746,7 +5746,7 @@ NOKPROBE_SYMBOL(preempt_count_add);
  * If the value passed in equals to the current preempt count
  * then we just enabled preemption. Stop timing the latency.
  */
-static inline void preempt_latency_stop(int val)
+static inline void preempt_latency_stop(long val)
 {
 	if (preempt_count() == val)
 		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
@@ -8774,7 +8774,7 @@ void __might_sleep(const char *file, int line)
 }
 EXPORT_SYMBOL(__might_sleep);
 
-static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
+static void print_preempt_disable_ip(long preempt_offset, unsigned long ip)
 {
 	if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
 		return;
@@ -8846,7 +8846,7 @@ void __might_resched(const char *file, int line, unsigned int offsets)
 }
 EXPORT_SYMBOL(__might_resched);
 
-void __cant_sleep(const char *file, int line, int preempt_offset)
+void __cant_sleep(const char *file, int line, long preempt_offset)
 {
 	static unsigned long prev_jiffy;
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 77198911b8dd..51a7f391edab 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -88,6 +88,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
 EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
 #endif
 
+#ifndef CONFIG_PREEMPT_LONG
+/*
+ * Any 32bit architecture that still cares about performance should
+ * probably ensure this is near preempt_count.
+ */
+DEFINE_PER_CPU(unsigned int, nmi_nesting);
+#endif
+
 /*
  * SOFTIRQ_OFFSET usage:
  *
@@ -609,7 +617,7 @@ static void handle_softirqs(bool ksirqd)
 
 	while ((softirq_bit = ffs(pending))) {
 		unsigned int vec_nr;
-		int prev_count;
+		long prev_count;
 
 		h += softirq_bit - 1;
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1f2364126894..89c348139218 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1723,7 +1723,7 @@ static void call_timer_fn(struct timer_list *timer,
 			  void (*fn)(struct timer_list *),
 			  unsigned long baseclk)
 {
-	int count = preempt_count();
+	long count = preempt_count();
 
 #ifdef CONFIG_LOCKDEP
 	/*
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index d939403331b5..8fd216bd0be6 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -1429,7 +1429,7 @@ static int unexpected_testcase_failures;
 
 static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
 {
-	int saved_preempt_count = preempt_count();
+	long saved_preempt_count = preempt_count();
 #ifdef CONFIG_PREEMPT_RT
 #ifdef CONFIG_SMP
 	int saved_mgd_count = current->migration_disabled;
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Boqun Feng 1 day, 8 hours ago
On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
[...]
>  DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index d57cab4d4c06..77defd9624bf 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -108,15 +108,14 @@ void irq_exit_rcu(void);
>  	do {							\
>  		lockdep_off();					\
>  		arch_nmi_enter();				\
> -		BUG_ON(in_nmi() == NMI_MASK);			\
> -		__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
> +		__preempt_count_nmi_enter();			\
>  	} while (0)
>  
>  #define nmi_enter()						\
>  	do {							\
>  		__nmi_enter();					\
>  		lockdep_hardirq_enter();			\
> -		ct_nmi_enter();				\
> +		ct_nmi_enter();					\
>  		instrumentation_begin();			\
>  		ftrace_nmi_enter();				\
>  		instrumentation_end();				\
> @@ -125,7 +124,7 @@ void irq_exit_rcu(void);
>  #define __nmi_exit()						\
>  	do {							\
>  		BUG_ON(!in_nmi());				\
> -		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
> +		__preempt_count_nmi_exit();			\
>  		arch_nmi_exit();				\
>  		lockdep_on();					\
>  	} while (0)
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index d964f965c8ff..7617ca97f442 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
[...]
> @@ -132,6 +145,27 @@ static __always_inline unsigned char interrupt_context_level(void)
>  # define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
>  #endif
>  
> +#ifndef CONFIG_PREEMPT_LONG
> +DECLARE_PER_CPU(unsigned int, nmi_nesting);
> +
> +#define __preempt_count_nmi_enter()				\
> +	do {							\
> +		unsigned int _o = NMI_MASK + HARDIRQ_OFFSET;	\
> +		__this_cpu_inc(nmi_nesting);			\
> +		_o -= (preempt_count() & NMI_MASK);		\
> +		__preempt_count_add(_o);			\
> +	} while (0)
> +
> +#define __preempt_count_nmi_exit()				\
> +	do {							\
> +		unsigned int _o = HARDIRQ_OFFSET;		\
> +		if (!__this_cpu_dec_return(nmi_nesting))	\
> +			_o += NMI_MASK;				\
> +		__preempt_count_sub(_o);			\
> +	} while (0)
> +
> +#endif
> +

We need to move it into include/linux/hardirq.h because percpu is not
included in <linux/preempt.h>.

Regards,
Boqun

>  /*
>   * The following macros are deprecated and should not be used in new code:
>   * in_softirq()   - We have BH disabled, or are processing softirqs
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Peter Zijlstra 21 hours ago
On Thu, Feb 05, 2026 at 02:07:40PM -0800, Boqun Feng wrote:
> On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
> [...]
> >  DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
> > diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> > index d57cab4d4c06..77defd9624bf 100644
> > --- a/include/linux/hardirq.h
> > +++ b/include/linux/hardirq.h
> > @@ -108,15 +108,14 @@ void irq_exit_rcu(void);
> >  	do {							\
> >  		lockdep_off();					\
> >  		arch_nmi_enter();				\
> > -		BUG_ON(in_nmi() == NMI_MASK);			\
> > -		__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
> > +		__preempt_count_nmi_enter();			\
> >  	} while (0)
> >  
> >  #define nmi_enter()						\
> >  	do {							\
> >  		__nmi_enter();					\
> >  		lockdep_hardirq_enter();			\
> > -		ct_nmi_enter();				\
> > +		ct_nmi_enter();					\
> >  		instrumentation_begin();			\
> >  		ftrace_nmi_enter();				\
> >  		instrumentation_end();				\
> > @@ -125,7 +124,7 @@ void irq_exit_rcu(void);
> >  #define __nmi_exit()						\
> >  	do {							\
> >  		BUG_ON(!in_nmi());				\
> > -		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
> > +		__preempt_count_nmi_exit();			\
> >  		arch_nmi_exit();				\
> >  		lockdep_on();					\
> >  	} while (0)
> > diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> > index d964f965c8ff..7617ca97f442 100644
> > --- a/include/linux/preempt.h
> > +++ b/include/linux/preempt.h
> [...]
> > @@ -132,6 +145,27 @@ static __always_inline unsigned char interrupt_context_level(void)
> >  # define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
> >  #endif
> >  
> > +#ifndef CONFIG_PREEMPT_LONG
> > +DECLARE_PER_CPU(unsigned int, nmi_nesting);
> > +
> > +#define __preempt_count_nmi_enter()				\
> > +	do {							\
> > +		unsigned int _o = NMI_MASK + HARDIRQ_OFFSET;	\
> > +		__this_cpu_inc(nmi_nesting);			\
> > +		_o -= (preempt_count() & NMI_MASK);		\
> > +		__preempt_count_add(_o);			\
> > +	} while (0)
> > +
> > +#define __preempt_count_nmi_exit()				\
> > +	do {							\
> > +		unsigned int _o = HARDIRQ_OFFSET;		\
> > +		if (!__this_cpu_dec_return(nmi_nesting))	\
> > +			_o += NMI_MASK;				\
> > +		__preempt_count_sub(_o);			\
> > +	} while (0)
> > +
> > +#endif
> > +
> 
> We need to move it into include/linux/hardirq.h because percpu is not
> included in <linux/preempt.h>.

That is fine. I also realized you can move the variants from
arch/x86/include/asm/preempt.h right next to it; they only depend on
PREEMPT_LONG, not anything else, so there is nothing arch-specific about
them.

That avoids them getting duplicated on arm64, s390, etc.
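
A rough sketch of that consolidation, reusing the bodies already posted
above and only changing where they live (illustrative, not a posted
patch; everything below would sit in include/linux/hardirq.h):

#ifdef CONFIG_PREEMPT_LONG
/* Wide preempt_count: the NMI bits themselves can hold the nesting depth. */
static __always_inline void __preempt_count_nmi_enter(void)
{
	__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);
}

static __always_inline void __preempt_count_nmi_exit(void)
{
	__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);
}
#else  /* !CONFIG_PREEMPT_LONG */
DECLARE_PER_CPU(unsigned int, nmi_nesting);

#define __preempt_count_nmi_enter()				\
	do {							\
		unsigned int _o = NMI_MASK + HARDIRQ_OFFSET;	\
		__this_cpu_inc(nmi_nesting);			\
		_o -= (preempt_count() & NMI_MASK);		\
		__preempt_count_add(_o);			\
	} while (0)

#define __preempt_count_nmi_exit()				\
	do {							\
		unsigned int _o = HARDIRQ_OFFSET;		\
		if (!__this_cpu_dec_return(nmi_nesting))	\
			_o += NMI_MASK;				\
		__preempt_count_sub(_o);			\
	} while (0)
#endif /* CONFIG_PREEMPT_LONG */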
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Boqun Feng 1 day, 8 hours ago
On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
> > But I'm really somewhat sad that 64bit can't do better than this.
> 
> Here, the below builds and boots (albeit with warnings because printf
> format crap sucks).
> 

Thanks! I will drop patches #1 and #2 and use this one (with a commit log
and some more tests). Given it's based on the work of Joel, Lyude and me,
would the following tags make sense to all of you?

Co-developed-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Co-developed-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Lyude Paul <lyude@redhat.com>
Co-developed-by: Peter Zijlstra (Intel) <peterz@infradead.org> 
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Boqun Feng <boqun@kernel.org>

Regards,
Boqun

> ---
>  arch/x86/Kconfig               |  1 +
>  arch/x86/include/asm/preempt.h | 53 ++++++++++++++++++++++++++++++------------
>  arch/x86/kernel/cpu/common.c   |  2 +-
>  include/linux/hardirq.h        |  7 +++---
>  include/linux/preempt.h        | 52 ++++++++++++++++++++++++++++++++++-------
>  init/main.c                    |  2 +-
>  kernel/Kconfig.preempt         |  4 ++++
>  kernel/sched/core.c            |  8 +++----
>  kernel/softirq.c               | 10 +++++++-
>  kernel/time/timer.c            |  2 +-
>  lib/locking-selftest.c         |  2 +-
>  11 files changed, 106 insertions(+), 37 deletions(-)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 80527299f859..2bd1972fd4c7 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -326,6 +326,7 @@ config X86
>  	select USER_STACKTRACE_SUPPORT
>  	select HAVE_ARCH_KCSAN			if X86_64
>  	select PROC_PID_ARCH_STATUS		if PROC_FS
> +	select PREEMPT_LONG			if X86_64
>  	select HAVE_ARCH_NODE_DEV_GROUP		if X86_SGX
>  	select FUNCTION_ALIGNMENT_16B		if X86_64 || X86_ALIGNMENT_16
>  	select FUNCTION_ALIGNMENT_4B
> diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> index 578441db09f0..1b54d5555138 100644
> --- a/arch/x86/include/asm/preempt.h
> +++ b/arch/x86/include/asm/preempt.h
> @@ -7,10 +7,19 @@
>  
>  #include <linux/static_call_types.h>
>  
> -DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);
> +DECLARE_PER_CPU_CACHE_HOT(unsigned long, __preempt_count);
>  
> -/* We use the MSB mostly because its available */
> -#define PREEMPT_NEED_RESCHED	0x80000000
> +/*
> + * We use the MSB for PREEMPT_NEED_RESCHED mostly because it is available.
> + */
> +
> +#ifdef CONFIG_64BIT
> +#define PREEMPT_NEED_RESCHED	(~((-1L) >> 1))
> +#define __pc_op(op, ...)	raw_cpu_##op##_8(__VA_ARGS__)
> +#else
> +#define PREEMPT_NEED_RESCHED	(~((-1) >> 1))
> +#define __pc_op(op, ...)	raw_cpu_##op##_4(__VA_ARGS__)
> +#endif
>  
>  /*
>   * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
> @@ -24,18 +33,18 @@ DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);
>   */
>  static __always_inline int preempt_count(void)
>  {
> -	return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
> +	return __pc_op(read, __preempt_count) & ~PREEMPT_NEED_RESCHED;
>  }
>  
> -static __always_inline void preempt_count_set(int pc)
> +static __always_inline void preempt_count_set(long pc)
>  {
>  	int old, new;
>  
> -	old = raw_cpu_read_4(__preempt_count);
> +	old = __pc_op(read, __preempt_count);
>  	do {
>  		new = (old & PREEMPT_NEED_RESCHED) |
>  			(pc & ~PREEMPT_NEED_RESCHED);
> -	} while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new));
> +	} while (!__pc_op(try_cmpxchg, __preempt_count, &old, new));
>  }
>  
>  /*
> @@ -58,33 +67,45 @@ static __always_inline void preempt_count_set(int pc)
>  
>  static __always_inline void set_preempt_need_resched(void)
>  {
> -	raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
> +	__pc_op(and, __preempt_count, ~PREEMPT_NEED_RESCHED);
>  }
>  
>  static __always_inline void clear_preempt_need_resched(void)
>  {
> -	raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
> +	__pc_op(or, __preempt_count, PREEMPT_NEED_RESCHED);
>  }
>  
>  static __always_inline bool test_preempt_need_resched(void)
>  {
> -	return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
> +	return !(__pc_op(read, __preempt_count) & PREEMPT_NEED_RESCHED);
>  }
>  
>  /*
>   * The various preempt_count add/sub methods
>   */
>  
> -static __always_inline void __preempt_count_add(int val)
> +static __always_inline void __preempt_count_add(long val)
>  {
> -	raw_cpu_add_4(__preempt_count, val);
> +	__pc_op(add, __preempt_count, val);
>  }
>  
> -static __always_inline void __preempt_count_sub(int val)
> +static __always_inline void __preempt_count_sub(long val)
>  {
> -	raw_cpu_add_4(__preempt_count, -val);
> +	__pc_op(add, __preempt_count, -val);
>  }
>  
> +#ifdef CONFIG_64BIT
> +static __always_inline void __preempt_count_nmi_enter(void)
> +{
> +	__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);
> +}
> +
> +static __always_inline void __preempt_count_nmi_exit(void)
> +{
> +	__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);
> +}
> +#endif
> +
>  /*
>   * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
>   * a decrement which hits zero means we have no preempt_count and should
> @@ -101,7 +122,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
>   */
>  static __always_inline bool should_resched(int preempt_offset)
>  {
> -	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
> +	return unlikely(__pc_op(read, __preempt_count) == preempt_offset);
>  }
>  
>  #ifdef CONFIG_PREEMPTION
> @@ -148,4 +169,6 @@ do { \
>  
>  #endif /* PREEMPTION */
>  
> +#undef __pc_op
> +
>  #endif /* __ASM_PREEMPT_H */
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index e7ab22fce3b5..9d3602f085c9 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -2219,7 +2219,7 @@ DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
>  EXPORT_PER_CPU_SYMBOL(current_task);
>  EXPORT_PER_CPU_SYMBOL(const_current_task);
>  
> -DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
> +DEFINE_PER_CPU_CACHE_HOT(unsigned long, __preempt_count) = INIT_PREEMPT_COUNT;
>  EXPORT_PER_CPU_SYMBOL(__preempt_count);
>  
>  DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index d57cab4d4c06..77defd9624bf 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -108,15 +108,14 @@ void irq_exit_rcu(void);
>  	do {							\
>  		lockdep_off();					\
>  		arch_nmi_enter();				\
> -		BUG_ON(in_nmi() == NMI_MASK);			\
> -		__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
> +		__preempt_count_nmi_enter();			\
>  	} while (0)
>  
>  #define nmi_enter()						\
>  	do {							\
>  		__nmi_enter();					\
>  		lockdep_hardirq_enter();			\
> -		ct_nmi_enter();				\
> +		ct_nmi_enter();					\
>  		instrumentation_begin();			\
>  		ftrace_nmi_enter();				\
>  		instrumentation_end();				\
> @@ -125,7 +124,7 @@ void irq_exit_rcu(void);
>  #define __nmi_exit()						\
>  	do {							\
>  		BUG_ON(!in_nmi());				\
> -		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
> +		__preempt_count_nmi_exit();			\
>  		arch_nmi_exit();				\
>  		lockdep_on();					\
>  	} while (0)
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index d964f965c8ff..7617ca97f442 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -17,6 +17,9 @@
>   *
>   * - bits 0-7 are the preemption count (max preemption depth: 256)
>   * - bits 8-15 are the softirq count (max # of softirqs: 256)
> + * - bits 16-23 are the hardirq disable count (max # of hardirq disable: 256)
> + * - bits 24-27 are the hardirq count (max # of hardirqs: 16)
> + * - bit 28 is the NMI flag (no nesting count, tracked separately)
>   *
>   * The hardirq count could in theory be the same as the number of
>   * interrupts in the system, but we run all interrupt handlers with
> @@ -24,31 +27,41 @@
>   * there are a few palaeontologic drivers which reenable interrupts in
>   * the handler, so we need more than one bit here.
>   *
> - *         PREEMPT_MASK:	0x000000ff
> - *         SOFTIRQ_MASK:	0x0000ff00
> - *         HARDIRQ_MASK:	0x000f0000
> - *             NMI_MASK:	0x00f00000
> - * PREEMPT_NEED_RESCHED:	0x80000000
> + * NMI nesting depth is tracked in a separate per-CPU variable
> + * (nmi_nesting) to save bits in preempt_count.
> + *
> + *				32bit		64bit + PREEMPT_LONG
> + *
> + *         PREEMPT_MASK:	0x000000ff	0x00000000000000ff
> + *         SOFTIRQ_MASK:	0x0000ff00	0x000000000000ff00
> + * HARDIRQ_DISABLE_MASK:	0x00ff0000	0x0000000000ff0000
> + *         HARDIRQ_MASK:	0x0f000000	0x000000000f000000
> + *             NMI_MASK:	0x10000000	0x00000000f0000000
> + * PREEMPT_NEED_RESCHED:	0x80000000	0x8000000000000000
>   */
>  #define PREEMPT_BITS	8
>  #define SOFTIRQ_BITS	8
> +#define HARDIRQ_DISABLE_BITS	8
>  #define HARDIRQ_BITS	4
> -#define NMI_BITS	4
> +#define NMI_BITS	(1 + 3*IS_ENABLED(CONFIG_PREEMPT_LONG))
>  
>  #define PREEMPT_SHIFT	0
>  #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
> -#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDIRQ_DISABLE_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDIRQ_SHIFT	(HARDIRQ_DISABLE_SHIFT + HARDIRQ_DISABLE_BITS)
>  #define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
>  
>  #define __IRQ_MASK(x)	((1UL << (x))-1)
>  
>  #define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
>  #define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
> +#define HARDIRQ_DISABLE_MASK	(__IRQ_MASK(HARDIRQ_DISABLE_BITS) << HARDIRQ_DISABLE_SHIFT)
>  #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
>  #define NMI_MASK	(__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
>  
>  #define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
>  #define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
> +#define HARDIRQ_DISABLE_OFFSET	(1UL << HARDIRQ_DISABLE_SHIFT)
>  #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
>  #define NMI_OFFSET	(1UL << NMI_SHIFT)
>  
> @@ -105,8 +118,8 @@ static __always_inline unsigned char interrupt_context_level(void)
>   * preempt_count() is commonly implemented with READ_ONCE().
>   */
>  
> -#define nmi_count()	(preempt_count() & NMI_MASK)
> -#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
> +#define nmi_count()		(preempt_count() & NMI_MASK)
> +#define hardirq_count()		(preempt_count() & HARDIRQ_MASK)
>  #ifdef CONFIG_PREEMPT_RT
>  # define softirq_count()	(current->softirq_disable_cnt & SOFTIRQ_MASK)
>  # define irq_count()		((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
> @@ -132,6 +145,27 @@ static __always_inline unsigned char interrupt_context_level(void)
>  # define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
>  #endif
>  
> +#ifndef CONFIG_PREEMPT_LONG
> +DECLARE_PER_CPU(unsigned int, nmi_nesting);
> +
> +#define __preempt_count_nmi_enter()				\
> +	do {							\
> +		unsigned int _o = NMI_MASK + HARDIRQ_OFFSET;	\
> +		__this_cpu_inc(nmi_nesting);			\
> +		_o -= (preempt_count() & NMI_MASK);		\
> +		__preempt_count_add(_o);			\
> +	} while (0)
> +
> +#define __preempt_count_nmi_exit()				\
> +	do {							\
> +		unsigned int _o = HARDIRQ_OFFSET;		\
> +		if (!__this_cpu_dec_return(nmi_nesting))	\
> +			_o += NMI_MASK;				\
> +		__preempt_count_sub(_o);			\
> +	} while (0)
> +
> +#endif
> +
>  /*
>   * The following macros are deprecated and should not be used in new code:
>   * in_softirq()   - We have BH disabled, or are processing softirqs
> diff --git a/init/main.c b/init/main.c
> index b84818ad9685..f8f4b78b7a06 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -1367,7 +1367,7 @@ static inline void do_trace_initcall_level(const char *level)
>  
>  int __init_or_module do_one_initcall(initcall_t fn)
>  {
> -	int count = preempt_count();
> +	long count = preempt_count();
>  	char msgbuf[64];
>  	int ret;
>  
> diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
> index 88c594c6d7fc..2ad9365915eb 100644
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -122,6 +122,10 @@ config PREEMPT_RT_NEEDS_BH_LOCK
>  config PREEMPT_COUNT
>         bool
>  
> +config PREEMPT_LONG
> +	bool
> +	depends on PREEMPT_COUNT && 64BIT
> +
>  config PREEMPTION
>         bool
>         select PREEMPT_COUNT
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b411e4feff7f..f54dd3cb66f2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5709,7 +5709,7 @@ static inline void sched_tick_stop(int cpu) { }
>   * If the value passed in is equal to the current preempt count
>   * then we just disabled preemption. Start timing the latency.
>   */
> -static inline void preempt_latency_start(int val)
> +static inline void preempt_latency_start(long val)
>  {
>  	if (preempt_count() == val) {
>  		unsigned long ip = get_lock_parent_ip();
> @@ -5746,7 +5746,7 @@ NOKPROBE_SYMBOL(preempt_count_add);
>   * If the value passed in equals to the current preempt count
>   * then we just enabled preemption. Stop timing the latency.
>   */
> -static inline void preempt_latency_stop(int val)
> +static inline void preempt_latency_stop(long val)
>  {
>  	if (preempt_count() == val)
>  		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
> @@ -8774,7 +8774,7 @@ void __might_sleep(const char *file, int line)
>  }
>  EXPORT_SYMBOL(__might_sleep);
>  
> -static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
> +static void print_preempt_disable_ip(long preempt_offset, unsigned long ip)
>  {
>  	if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
>  		return;
> @@ -8846,7 +8846,7 @@ void __might_resched(const char *file, int line, unsigned int offsets)
>  }
>  EXPORT_SYMBOL(__might_resched);
>  
> -void __cant_sleep(const char *file, int line, int preempt_offset)
> +void __cant_sleep(const char *file, int line, long preempt_offset)
>  {
>  	static unsigned long prev_jiffy;
>  
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 77198911b8dd..51a7f391edab 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -88,6 +88,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
>  EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
>  #endif
>  
> +#ifndef CONFIG_PREEMPT_LONG
> +/*
> + * Any 32bit architecture that still cares about performance should
> + * probably ensure this is near preempt_count.
> + */
> +DEFINE_PER_CPU(unsigned int, nmi_nesting);
> +#endif
> +
>  /*
>   * SOFTIRQ_OFFSET usage:
>   *
> @@ -609,7 +617,7 @@ static void handle_softirqs(bool ksirqd)
>  
>  	while ((softirq_bit = ffs(pending))) {
>  		unsigned int vec_nr;
> -		int prev_count;
> +		long prev_count;
>  
>  		h += softirq_bit - 1;
>  
> diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> index 1f2364126894..89c348139218 100644
> --- a/kernel/time/timer.c
> +++ b/kernel/time/timer.c
> @@ -1723,7 +1723,7 @@ static void call_timer_fn(struct timer_list *timer,
>  			  void (*fn)(struct timer_list *),
>  			  unsigned long baseclk)
>  {
> -	int count = preempt_count();
> +	long count = preempt_count();
>  
>  #ifdef CONFIG_LOCKDEP
>  	/*
> diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
> index d939403331b5..8fd216bd0be6 100644
> --- a/lib/locking-selftest.c
> +++ b/lib/locking-selftest.c
> @@ -1429,7 +1429,7 @@ static int unexpected_testcase_failures;
>  
>  static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
>  {
> -	int saved_preempt_count = preempt_count();
> +	long saved_preempt_count = preempt_count();
>  #ifdef CONFIG_PREEMPT_RT
>  #ifdef CONFIG_SMP
>  	int saved_mgd_count = current->migration_disabled;
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Joel Fernandes 1 day, 8 hours ago

On 2/5/2026 4:40 PM, Boqun Feng wrote:
> On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
>> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
>>> But I'm really somewhat sad that 64bit can't do better than this.
>>
>> Here, the below builds and boots (albeit with warnings because printf
>> format crap sucks).
>>
> 
> Thanks! I will drop patch #1 and #2 and use this one (with a commit log
> and some more tests), given it's based on the work of Joel, Lyude and
> me, would the following tags make sense to all of you?
> > Co-developed-by: Joel Fernandes <joelagnelf@nvidia.com>

I don't know, I am not a big fan of the alternative patch because it adds a
per-cpu counter anyway if !CONFIG_PREEMPT_LONG [1]. And it is also a much bigger
patch than the one I wrote. Purely from an objective perspective, I would still
want to keep my original patch because it is simple. What is really the
objection to it?

[1]
+#ifndef CONFIG_PREEMPT_LONG
+/*
+ * Any 32bit architecture that still cares about performance should
+ * probably ensure this is near preempt_count.
+ */
+DEFINE_PER_CPU(unsigned int, nmi_nesting);
+#endif

Thanks,
--
Joel Fernandes
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Joel Fernandes 1 day, 5 hours ago

On 2/5/2026 5:17 PM, Joel Fernandes wrote:
> 
> 
> On 2/5/2026 4:40 PM, Boqun Feng wrote:
>> On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
>>> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
>>>> But I'm really somewhat sad that 64bit can't do better than this.
>>>
>>> Here, the below builds and boots (albeit with warnings because printf
>>> format crap sucks).
>>>
>>
>> Thanks! I will drop patch #1 and #2 and use this one (with a commit log
>> and some more tests), given it's based on the work of Joel, Lyude and
>> me, would the following tags make sense to all of you?
>>> Co-developed-by: Joel Fernandes <joelagnelf@nvidia.com>
> 
> I don't know, I am not a big fan of the alternative patch because it adds a
> per-cpu counter anyway if !CONFIG_PREEMPT_LONG [1]. And it is also a much bigger
> patch than the one I wrote. Purely from an objective perspective, I would still
> want to keep my original patch because it is simple. What is really the
> objection to it?
> 
> [1]
> +#ifndef CONFIG_PREEMPT_LONG
> +/*
> + * Any 32bit architecture that still cares about performance should
> + * probably ensure this is near preempt_count.
> + */
> +DEFINE_PER_CPU(unsigned int, nmi_nesting);
> +#endif
> 
If the objection to my patch is modifying a per-cpu counter, isn't NMI a slow
path? If we agree, then keeping things simple is better IMO unless we have data
showing that it is an issue. This code is already quite convoluted; let us
not convolute it more with 32-bit-specific things.

I had tried moving it to DEFINE_PER_CPU_CACHE_HOT, but ISTR that did not work
out (I think something about a limit to how many things could be moved to cache
hot).
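
For reference, the cache-hot variant being referred to would presumably
have been just the pair below (hypothetical placement, mirroring how
__preempt_count is declared and defined in the x86 bits of Peter's patch):

/* in the header: */
DECLARE_PER_CPU_CACHE_HOT(unsigned int, nmi_nesting);
/* at one definition site: */
DEFINE_PER_CPU_CACHE_HOT(unsigned int, nmi_nesting);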

Happy to revise patch again with any other suggestions,

--
Joel Fernandes
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Boqun Feng 1 day, 5 hours ago
On Thu, Feb 05, 2026 at 07:50:03PM -0500, Joel Fernandes wrote:
> 
> 
> On 2/5/2026 5:17 PM, Joel Fernandes wrote:
> > 
> > 
> > On 2/5/2026 4:40 PM, Boqun Feng wrote:
> >> On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
> >>> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
> >>>> But I'm really somewhat sad that 64bit can't do better than this.
> >>>
> >>> Here, the below builds and boots (albeit with warnings because printf
> >>> format crap sucks).
> >>>
> >>
> >> Thanks! I will drop patch #1 and #2 and use this one (with a commit log
> >> and some more tests), given it's based on the work of Joel, Lyude and
> >> me, would the following tags make sense to all of you?
> >>> Co-developed-by: Joel Fernandes <joelagnelf@nvidia.com>
> > 
> > I don't know, I am not a big fan of the alternative patch because it adds a
> > per-cpu counter anyway if !CONFIG_PREEMPT_LONG [1]. And it is also a much bigger
> > patch than the one I wrote. Purely from an objective perspective, I would still
> > want to keep my original patch because it is simple. What is really the
> > objection to it?
> > 

PREEMPT_LONG is an architecture-specific way to improve performance,
IMO. Just to be clear, do you object to it at all, or do you object to
combining it with your original patch? If it's the latter, I could make
another patch as a follow-up to enable PREEMPT_LONG.

> > [1]
> > +#ifndef CONFIG_PREEMPT_LONG
> > +/*
> > + * Any 32bit architecture that still cares about performance should
> > + * probably ensure this is near preempt_count.
> > + */
> > +DEFINE_PER_CPU(unsigned int, nmi_nesting);
> > +#endif
> > 
> If the objection to my patch is modifying a per-cpu counter, isn't NMI a slow
> path? If we agree, then keeping things simple is better IMO unless we have data

I guess Peter was trying to say it's not a slow path if you consider
perf event interrupts on x86? [1]

> showing that it is an issue. This is code is already quite convoluted, let us
> not convolute it more with 32-bit specific things.
> 
> I had tried moving it to DEFINE_PER_CPU_CACHE_HOT, but ISTR that did not work
> out (I think something about a limit to how many things could be moved to cache
> hot).
> 
> Happy to revise patch again with any other suggestions,
> 

[1]: https://lore.kernel.org/rust-for-linux/20260204130027.GE3016024@noisy.programming.kicks-ass.net/

Regards,
Boqun

> --
> Joel Fernandes
>
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Joel Fernandes 1 day, 5 hours ago

On 2/5/2026 8:14 PM, Boqun Feng wrote:
> On Thu, Feb 05, 2026 at 07:50:03PM -0500, Joel Fernandes wrote:
>>
>>
>> On 2/5/2026 5:17 PM, Joel Fernandes wrote:
>>>
>>>
>>> On 2/5/2026 4:40 PM, Boqun Feng wrote:
>>>> On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
>>>>> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
>>>>>> But I'm really somewhat sad that 64bit can't do better than this.
>>>>>
>>>>> Here, the below builds and boots (albeit with warnings because printf
>>>>> format crap sucks).
>>>>>
>>>>
>>>> Thanks! I will drop patch #1 and #2 and use this one (with a commit log
>>>> and some more tests), given it's based on the work of Joel, Lyude and
>>>> me, would the following tags make sense to all of you?
>>>>> Co-developed-by: Joel Fernandes <joelagnelf@nvidia.com>
>>>
>>> I don't know, I am not a big fan of the alternative patch because it adds a
>>> per-cpu counter anyway if !CONFIG_PREEMPT_LONG [1]. And it is also a much bigger
>>> patch than the one I wrote. Purely from an objective perspective, I would still
>>> want to keep my original patch because it is simple. What is really the
>>> objection to it?
>>>
> 
> PREEMPT_LONG is an architecture-specific way to improve the performance
> IMO. Just to be clear, do you object it at all, or do you object
> combining it with your original patch? If it's the latter, I could make
> another patch as a follow to enable PREEMPT_LONG.

When I looked at the alternative patch, I did consider that it was
overcomplicated and that it should be justified. Otherwise, I don't object to
it. It seems to be a matter of preference, I think. I would prefer a simpler fix
over an overcomplicated fix for a hypothetical issue (unless we have data
showing an issue). If it were a few lines of change, that'd be a different story.

> 
>>> [1]
>>> +#ifndef CONFIG_PREEMPT_LONG
>>> +/*
>>> + * Any 32bit architecture that still cares about performance should
>>> + * probably ensure this is near preempt_count.
>>> + */
>>> +DEFINE_PER_CPU(unsigned int, nmi_nesting);
>>> +#endif
>>>
>> If the objection to my patch is modifying a per-cpu counter, isn't NMI a slow
>> path? If we agree, then keeping things simple is better IMO unless we have data
> 
> I guess Peter was trying to say it's not a slow path if you consider
> perf event interrupts on x86? [1]

How are we handling this performance issue then on 32-bit x86 architecture with
perf? Or are we saying we don't care about performance on 32-bit?

-- 
Joel Fernandes
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Peter Zijlstra 21 hours ago
On Thu, Feb 05, 2026 at 08:24:40PM -0500, Joel Fernandes wrote:

> > I guess Peter was trying to say it's not a slow path if you consider
> > perf event interrupts on x86? [1]
> 
> How are we handling this performance issue then on 32-bit x86 architecture with
> perf? Or are we saying we don't care about performance on 32-bit?

Yeah, in general I don't consider any 32bit architecture performance
critical at this point. It's pure legacy code, to be removed at some
point.

As for x86_32 in particular, we make it limp along. It sorta builds and
sorta boots, but meh. It doesn't even have most of the speculation fixes.
You really, as in *REALLY*, should not be running an x86_32 kernel.

I mean, if you still want to run Linux on your museum-grade Pentium-II
processor, don't let me stop you. Just don't expect miracles.
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Boqun Feng 1 day, 3 hours ago
On Thu, Feb 05, 2026 at 08:24:40PM -0500, Joel Fernandes wrote:
> 
> 
> On 2/5/2026 8:14 PM, Boqun Feng wrote:
> > On Thu, Feb 05, 2026 at 07:50:03PM -0500, Joel Fernandes wrote:
> >>
> >>
> >> On 2/5/2026 5:17 PM, Joel Fernandes wrote:
> >>>
> >>>
> >>> On 2/5/2026 4:40 PM, Boqun Feng wrote:
> >>>> On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
> >>>>> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
> >>>>>> But I'm really somewhat sad that 64bit can't do better than this.
> >>>>>
> >>>>> Here, the below builds and boots (albeit with warnings because printf
> >>>>> format crap sucks).
> >>>>>
> >>>>
> >>>> Thanks! I will drop patch #1 and #2 and use this one (with a commit log
> >>>> and some more tests), given it's based on the work of Joel, Lyude and
> >>>> me, would the following tags make sense to all of you?
> >>>>> Co-developed-by: Joel Fernandes <joelagnelf@nvidia.com>
> >>>
> >>> I don't know, I am not a big fan of the alternative patch because it adds a
> >>> per-cpu counter anyway if !CONFIG_PREEMPT_LONG [1]. And it is also a much bigger
> >>> patch than the one I wrote. Purely from an objective perspective, I would still
> >>> want to keep my original patch because it is simple. What is really the
> >>> objection to it?
> >>>
> > 
> > PREEMPT_LONG is an architecture-specific way to improve the performance
> > IMO. Just to be clear, do you object it at all, or do you object
> > combining it with your original patch? If it's the latter, I could make
> > another patch as a follow to enable PREEMPT_LONG.
> 
> When I looked at the alternative patch, I did consider that it was
> overcomplicated and it should be justified. Otherwise, I don't object to it. It

I don't think that's overcomplicated. Note that people have different
goals: for us (you, Lyude and me), we want a safer interrupt-disabling
lock API, hence this patchset. Peter, on the other hand, while agreeing
with us on the necessity, wants to avoid a potential performance loss
(and maybe in general also likes the idea of preempt_count being 64bit
on 64bit machines ;-)). That patch looks "overcomplicated" because it
serves both goals (it actually contains patch #1 and #2 along with the
improvement). If you look at them separately, it is not that complicated
(Peter's diff against patches 1 + 2 will be relatively small).

> seems to be a matter of preference I think. I would prefer a simpler fix than an
> overcomplicated fix for a hypothetical issue (unless we have data showing
> issue). If it was a few lines of change, that'd be different story.
> 
> > 
> >>> [1]
> >>> +#ifndef CONFIG_PREEMPT_LONG
> >>> +/*
> >>> + * Any 32bit architecture that still cares about performance should
> >>> + * probably ensure this is near preempt_count.
> >>> + */
> >>> +DEFINE_PER_CPU(unsigned int, nmi_nesting);
> >>> +#endif
> >>>
> >> If the objection to my patch is modifying a per-cpu counter, isn't NMI a slow
> >> path? If we agree, then keeping things simple is better IMO unless we have data
> > 
> > I guess Peter was trying to say it's not a slow path if you consider
> > perf event interrupts on x86? [1]
> 
> How are we handling this performance issue then on 32-bit x86 architecture with
> perf? Or are we saying we don't care about performance on 32-bit?
> 

I'm not in a position to answer this (especially the second question).
Either we have data proving that the performance gap caused by your
original patch is small enough (if there is any), or it's up to the x86
maintainers.

Regards,
Boqun

> -- 
> Joel Fernandes
>
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Joel Fernandes 22 hours ago
> On Feb 5, 2026, at 9:51 PM, Boqun Feng <boqun@kernel.org> wrote:
> 
> On Thu, Feb 05, 2026 at 08:24:40PM -0500, Joel Fernandes wrote:
>> 
>> 
>>> On 2/5/2026 8:14 PM, Boqun Feng wrote:
>>> On Thu, Feb 05, 2026 at 07:50:03PM -0500, Joel Fernandes wrote:
>>>> 
>>>> 
>>>> On 2/5/2026 5:17 PM, Joel Fernandes wrote:
>>>>> 
>>>>> 
>>>>> On 2/5/2026 4:40 PM, Boqun Feng wrote:
>>>>>> On Wed, Feb 04, 2026 at 12:12:34PM +0100, Peter Zijlstra wrote:
>>>>>>> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
>>>>>>>> But I'm really somewhat sad that 64bit can't do better than this.
>>>>>>> 
>>>>>>> Here, the below builds and boots (albeit with warnings because printf
>>>>>>> format crap sucks).
>>>>>>> 
>>>>>> 
>>>>>> Thanks! I will drop patch #1 and #2 and use this one (with a commit log
>>>>>> and some more tests), given it's based on the work of Joel, Lyude and
>>>>>> me, would the following tags make sense to all of you?
>>>>>>> Co-developed-by: Joel Fernandes <joelagnelf@nvidia.com>
>>>>> 
>>>>> I don't know, I am not a big fan of the alternative patch because it adds a
>>>>> per-cpu counter anyway if !CONFIG_PREEMPT_LONG [1]. And it is also a much bigger
>>>>> patch than the one I wrote. Purely from an objective perspective, I would still
>>>>> want to keep my original patch because it is simple. What is really the
>>>>> objection to it?
>>>>> 
>>> 
>>> PREEMPT_LONG is an architecture-specific way to improve the performance
>>> IMO. Just to be clear, do you object it at all, or do you object
>>> combining it with your original patch? If it's the latter, I could make
>>> another patch as a follow to enable PREEMPT_LONG.
>> 
>> When I looked at the alternative patch, I did consider that it was
>> overcomplicated and it should be justified. Otherwise, I don't object to it. It
> 
> I don't think that's overcomplicated. Note that people have different
> goals, for us (you, Lyude and me), we want to have a safer
> interrupt-disabling lock API, hence this patchset.

I was also coming from the goal of long term kernel code maintainability. If
we decide to have additional preempt count flags in the future, does special
casing 32 bit add even more complexity? (not rhetorical, really asking)

> I think Peter, on the
> other hand, agrees with us on the necessity but wants to avoid a
> potential performance loss (and maybe in general also likes the idea of
> preempt_count being 64bit on 64bit machines ;-)). That patch looks
> "overcomplicated" because it covers both goals (it actually contains
> patch #1 and #2 along with the improvement). If you look at them
> separately, it is not that complicated (Peter's diff against patches
> #1 and #2 will be relatively small).

Looking at it further, I think my hesitation is mostly around the extra
config option and the special-casing of 32 bit, as mentioned above. But
to answer your other question: if it is decided to go with Peter's
patch, you can use my Co-developed-by tag.

-- 
Joel Fernandes

Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Boqun Feng 15 hours ago
On Fri, Feb 06, 2026 at 03:13:40AM -0500, Joel Fernandes wrote:
[..]
> >>> 
> >>> PREEMPT_LONG is an architecture-specific way to improve the performance
> >>> IMO. Just to be clear, do you object it at all, or do you object
> >>> combining it with your original patch? If it's the latter, I could make
> >>> another patch as a follow to enable PREEMPT_LONG.
> >> 
> >> When I looked at the alternative patch, I did consider that it was
> >> overcomplicated and it should be justified. Otherwise, I don't object to it. It
> > 
> > I don't think that's overcomplicated. Note that people have different
> > goals, for us (you, Lyude and me), we want to have a safer
> > interrupt-disabling lock API, hence this patchset.
> 
> I was also coming from the goal of long term kernel code maintainability. If
> we decide to have additional preempt count flags in the future, does special
> casing 32 bit add even more complexity? (not rhetorical, really asking)
> 

First, given what preempt count is, I don't think that'll happen
frequently. Also, I think the reality is that we care about 64bit
performance more than 32bit; in that sense, if this "conditional 32bit
preempt count" case becomes an issue, the reasonable action to me is
just making all of preempt count 64bit (using an irq-disabling critical
section, or some special locking, on 32bit), and that would make things
simpler. That's the long-term view from me. (Now that I think about it,
the NMI tracking we proposed in this patch is actually a special case of
that ;-))
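
A minimal sketch of that long-term view, purely for illustration (this is
not from any posted patch, and the names preempt_count_64 and
__preempt_count_add_large are made up):

/*
 * If preempt_count were 64bit everywhere, a 32bit architecture could not
 * update a 64bit per-CPU word in a single instruction, so the
 * read-modify-write would sit inside an irq-disabling critical section.
 */
DEFINE_PER_CPU(u64, preempt_count_64);

static __always_inline void __preempt_count_add_large(u64 val)
{
        unsigned long flags;

        local_irq_save(flags);
        __this_cpu_add(preempt_count_64, val);
        local_irq_restore(flags);
}

Note that local_irq_save() does not mask NMIs, so NMI-side state would
still need separate treatment - which is one way to read the remark that
the per-CPU nmi_nesting counter is a special case of this.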

> > I think Peter, on the
> > other hand, agrees with us on the necessity but wants to avoid a
> > potential performance loss (and maybe in general also likes the idea of
> > preempt_count being 64bit on 64bit machines ;-)). That patch looks
> > "overcomplicated" because it covers both goals (it actually contains
> > patch #1 and #2 along with the improvement). If you look at them
> > separately, it is not that complicated (Peter's diff against patches
> > #1 and #2 will be relatively small).
> 
> Looking at it further, I think my hesitation is mostly around the extra
> config option and the special-casing of 32 bit, as mentioned above. But
> to answer your other question: if it is decided to go with Peter's
> patch, you can use my Co-developed-by tag.
> 

Thank you! But I realized more changes are needed, so we should probably
add PREEMPT_LONG as a follow-up (for example, should_resched() should
take a long instead of an int, and there are also the printf format
issues that Peter mentioned).
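
For reference, the generic version of that helper currently looks roughly
like this (include/asm-generic/preempt.h); under a hypothetical
PREEMPT_LONG both preempt_count() and the offset parameter would have to
widen to long:

static __always_inline bool should_resched(int preempt_offset)
{
        return unlikely(preempt_count() == preempt_offset &&
                        tif_need_resched());
}

The format specifiers used wherever preempt_count() gets printed would
presumably need the same treatment, which sounds like the printf noise
Peter mentioned.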

Regards,
Boqun

> -- 
> Joel Fernandes
>
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Joel Fernandes 14 hours ago

On 2/6/2026 10:28 AM, Boqun Feng wrote:
>> I was also coming from the goal of long term kernel code maintainability. If
>> we decide to have additional preempt count flags in the future, does special
>> casing 32 bit add even more complexity? (not rhetorical, really asking)
>>
> First, given what preempt count is, I don't think that'll happen
> frequently.

I'm not sure I buy the argument that this won't happen frequently. I don't
think any of us has a crystal ball; there are cases that could come up in
the future, IMO.

> Also, I think the reality is that we care about 64bit performance more
> than 32bit; in that sense, if this "conditional 32bit preempt count"
> case becomes an issue, the reasonable action to me is just making all
> of preempt count 64bit

You might be missing something here. You can't make all of preempt count 64 bit,
that's the point, it doesn't work. That's why Peter did what he did to
special-case 32 bit. See:
https://lore.kernel.org/all/20251020204421.GA197647@joelbox2/

That said, I am ok with the approach now that Peter mentions 32-bit x86 is
"deprecated". :-)

-- 
Joel Fernandes
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Boqun Feng 14 hours ago
On Fri, Feb 06, 2026 at 11:00:16AM -0500, Joel Fernandes wrote:
> 
> 
> On 2/6/2026 10:28 AM, Boqun Feng wrote:
> >> I was also coming from the goal of long term kernel code maintainability. If
> >> we decide to have additional preempt count flags in the future, does special
> >> casing 32 bit add even more complexity? (not rhetorical, really asking)
> >>
> > First, given what preempt count is, I don't think that'll happen
> > frequently.
> 
> I'm not sure I buy the argument that this won't happen frequently. I don't
> think any of us has a crystal ball; there are cases that could come up in
> the future, IMO.
> 

It's just me being realistic, and we pretty much use all the bits there
already.

> > Also, I think the reality is that we care about 64bit performance more
> > than 32bit; in that sense, if this "conditional 32bit preempt count"
> > case becomes an issue, the reasonable action to me is just making all
> > of preempt count 64bit
> 
> You might be missing something here. You can't make all of preempt count 64 bit,
> that's the point, it doesn't work. That's why Peter did what he did to
> special-case 32 bit. See:
> https://lore.kernel.org/all/20251020204421.GA197647@joelbox2/
> 
> That said, I am ok with the approach now that Peter mentions 32-bit x86 is
> "deprecated". :-)
> 

Yeah, "can't" is a strong word? ;-) I did say if we care more about
performance on 64bit than 32bit and can afford slowing down 32bit
preemption disabling in the case where "we decide to have additional
preempt count flags", THEN we can make all preempt count 64bit.

Regards,
Boqun

> -- 
> Joel Fernandes
>
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Gary Guo 2 days, 18 hours ago
On Wed Feb 4, 2026 at 11:12 AM GMT, Peter Zijlstra wrote:
> On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
>> But I'm really somewhat sad that 64bit can't do better than this.
>
> Here, the below builds and boots (albeit with warnings because printf
> format crap sucks).

Hi Peter,

I am not sure if it's worth the complexity to do this for the NMI code path.
I don't think the NMI code path is hot enough for this to be necessary?

Best,
Gary

>
> ---
>  arch/x86/Kconfig               |  1 +
>  arch/x86/include/asm/preempt.h | 53 ++++++++++++++++++++++++++++++------------
>  arch/x86/kernel/cpu/common.c   |  2 +-
>  include/linux/hardirq.h        |  7 +++---
>  include/linux/preempt.h        | 52 ++++++++++++++++++++++++++++++++++-------
>  init/main.c                    |  2 +-
>  kernel/Kconfig.preempt         |  4 ++++
>  kernel/sched/core.c            |  8 +++----
>  kernel/softirq.c               | 10 +++++++-
>  kernel/time/timer.c            |  2 +-
>  lib/locking-selftest.c         |  2 +-
>  11 files changed, 106 insertions(+), 37 deletions(-)
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Peter Zijlstra 2 days, 17 hours ago
On Wed, Feb 04, 2026 at 12:32:45PM +0000, Gary Guo wrote:
> On Wed Feb 4, 2026 at 11:12 AM GMT, Peter Zijlstra wrote:
> > On Tue, Feb 03, 2026 at 01:15:21PM +0100, Peter Zijlstra wrote:
> >> But I'm really somewhat sad that 64bit can't do better than this.
> >
> > Here, the below builds and boots (albeit with warnings because printf
> > format crap sucks).
> 
> Hi Peter,
> 
> I am not sure if it's worth the complexity to do this for the NMI code path.
> I don't think the NMI code path is hot enough for this to be necessary?

Perf uses NMI. Also, the 64bit code is actually simpler.
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Peter Zijlstra 3 days, 18 hours ago
On Wed, Jan 21, 2026 at 05:39:05PM -0500, Lyude Paul wrote:

> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 77198911b8dd4..af47ea23aba3b 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -88,6 +88,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
>  EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
>  #endif
>  
> +DEFINE_PER_CPU(unsigned int, nmi_nesting);

What happened with putting this in the same cache line as preempt_count?
Re: [PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
Posted by Joel Fernandes 1 day, 5 hours ago

On 2/3/2026 6:44 AM, Peter Zijlstra wrote:
> On Wed, Jan 21, 2026 at 05:39:05PM -0500, Lyude Paul wrote:
> 
>> diff --git a/kernel/softirq.c b/kernel/softirq.c
>> index 77198911b8dd4..af47ea23aba3b 100644
>> --- a/kernel/softirq.c
>> +++ b/kernel/softirq.c
>> @@ -88,6 +88,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
>>  EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
>>  #endif
>>  
>> +DEFINE_PER_CPU(unsigned int, nmi_nesting);
> 
> What happened with putting this in the same cache line as preempt_count?

I can try to do that again if we still want to go with this patch. When I
last tried it, I ran into issues I can't remember now (the space being
limited, maybe?).
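
For completeness, the placement being asked about would look something
like the below on x86 - purely an illustrative sketch, not a posted
patch, assuming the cache-hot per-CPU section that current mainline uses
for __preempt_count in arch/x86/kernel/cpu/common.c (declaration changes
and the non-x86 fallback are omitted):

DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);

/*
 * Defining the NMI nesting depth right next to the preempt count keeps
 * the two counters together, typically in the same cache line.
 */
DEFINE_PER_CPU_CACHE_HOT(unsigned int, nmi_nesting);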