[RFC 6/8] x86_64/bug: Implement __WARN_printf()

Peter Zijlstra posted 8 patches 6 months, 2 weeks ago
[RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Peter Zijlstra 6 months, 2 weeks ago
Optimize __WARN_printf() for the common number of arguments.

By storing the WARN() format string in the bug_entry, it becomes
possible to print this from the exception handler, instead of having
to emit a __warn_printf() call before tripping the exception.

This yields more compact code, while also providing an opportunity to
simplify WARN_ONCE() -- in a future patch.

The only 'minor' detail is how to get the printf() arguments into the
exception handler.

Note that the regular calling convention uses 6 registers to pass
arguments (before pushing to the stack) and observe that UD1 has a
ModRM byte which can encode two registers.

This then suggests using UD1 exceptions for small __WARN_printf()
argument numbers, encoding the registers holding the arguments in the
UD1 instruction.

Except that, when creating a histogram of the number of arguments
given to WARN (including WARN_ONCE):

defconfig:			allyesconfig:
    899 __COUNT_printf_3           9694 __COUNT_printf_0
    795 __COUNT_printf_2           4498 __COUNT_printf_1
    360 __COUNT_printf_0           3213 __COUNT_printf_3
    212 __COUNT_printf_1           2949 __COUNT_printf_2
     86 __COUNT_printf_4            186 __COUNT_printf_4
     37 __COUNT_printf_5             84 __COUNT_printf_5
     15 __COUNT_printf_6             50 __COUNT_printf_6
     14 __COUNT_printf_7             16 __COUNT_printf_7
      3 __COUNT_printf_8              3 __COUNT_printf_8
      3 __COUNT_printf_12             3 __COUNT_printf_12
                                      2 __COUNT_printf_9

It becomes clear that supporting 3 arguments is rather critical. This
requires encoding 3 registers in an instruction, which calls for some
creativity :-)

Use the form:

  UD1	disp8(%ecx), %reg

And encode two registers (one per nibble) in the displacement value.
Use (%ecx) as base, because UBSAN already uses (%eax) to encode its
immediates.

This then yields the following encodings:

  nr args:		insn:
    0			UD2
    1			UD1	(%ecx), %reg
    2			UD1	%reg2, %reg1
    3			UD1	disp8(%ecx), %reg1

Use the normal COUNT_ARGS() trick to split the variadic WARN() macro
into per nr_args sub-marcos, except use a custom mapping such that 4
and above map to another variadic that does the current thing as
fallback.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/bug.h |  115 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c    |  137 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 225 insertions(+), 27 deletions(-)

--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -5,6 +5,7 @@
 #include <linux/stringify.h>
 #include <linux/instrumentation.h>
 #include <linux/objtool.h>
+#include <linux/args.h>
 
 /*
  * Despite that some emulators terminate on UD2, we use it for WARN().
@@ -26,6 +27,7 @@
 #define BUG_UD2			0xfffe
 #define BUG_UD1			0xfffd
 #define BUG_UD1_UBSAN		0xfffc
+#define BUG_UD1_WARN		0xfffb
 #define BUG_EA			0xffea
 #define BUG_LOCK		0xfff0
 
@@ -108,6 +110,119 @@ do {								\
 	instrumentation_end();					\
 } while (0)
 
+#ifdef HAVE_ARCH_BUG_FORMAT
+
+#ifndef __ASSEMBLER__
+struct pt_regs;
+extern void __warn_printf(const char *fmt, struct pt_regs *regs);
+#define __warn_printf __warn_printf
+#endif
+
+#define __WARN_printf_0(flags, format)					\
+do {									\
+	__auto_type __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_FORMAT; \
+	instrumentation_begin();					\
+	asm_inline volatile("1: ud2\n"					\
+			    ANNOTATE_REACHABLE(1b)			\
+	    __stringify(ASM_BUGTABLE_FLAGS(1b, %c[fmt], %c[file], %c[line], %c[fl])) "\n" \
+		     : : [file] "i" (__FILE__), [line] "i" (__LINE__),	\
+			 [fl] "i" (__flags), [fmt] "i" (format));	\
+	instrumentation_end();						\
+} while (0)
+
+#define __WARN_printf_1(flags, format, arg1)				\
+do {									\
+	__auto_type __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_FORMAT; \
+	instrumentation_begin();					\
+	asm_inline volatile("1: ud1 (%%ecx), %[a1]\n"			\
+			    ANNOTATE_REACHABLE(1b)			\
+	    __stringify(ASM_BUGTABLE_FLAGS(1b, %c[fmt], %c[file], %c[line], %c[fl])) "\n" \
+		     : : [file] "i" (__FILE__), [line] "i" (__LINE__),	\
+			 [fl] "i" (__flags), [fmt] "i" (format),	\
+			 [a1] "r" ((unsigned long)(arg1)));		\
+	instrumentation_end();						\
+} while (0)
+
+#define __WARN_printf_2(flags, format, arg1, arg2)			\
+do {									\
+	__auto_type __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_FORMAT; \
+	instrumentation_begin();					\
+	asm_inline volatile("1: ud1 %[a2], %[a1]\n"			\
+			    ANNOTATE_REACHABLE(1b)			\
+	    __stringify(ASM_BUGTABLE_FLAGS(1b, %c[fmt], %c[file], %c[line], %c[fl])) "\n" \
+		     : : [file] "i" (__FILE__), [line] "i" (__LINE__),	\
+			 [fl] "i" (__flags), [fmt] "i" (format),	\
+			 [a1] "r" ((unsigned long)(arg1)),		\
+			 [a2] "r" ((unsigned long)(arg2)));		\
+	instrumentation_end();						\
+} while (0)
+
+/*
+ * warn_add_reg var reg -- adds the machine register index to var
+ */
+#define DEFINE_WARN_REG \
+	".macro warn_add_reg var:req reg:req\n"							\
+	".set .Lfound, 0\n"									\
+	".set .Lregnr, 0\n"									\
+	".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n"		\
+	".ifc \\reg, %%\\rs\n"									\
+	".set .Lfound, .Lfound+1\n"								\
+	".set \\var, \\var + .Lregnr\n"								\
+	".endif\n"										\
+	".set .Lregnr, .Lregnr+1\n"								\
+	".endr\n"										\
+	".set .Lregnr, 0\n"									\
+	".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n"	\
+	".ifc \\reg, %%\\rs\n"									\
+	".set .Lfound, .Lfound+1\n"								\
+	".set \\var, \\var + .Lregnr\n"								\
+	".endif\n"										\
+	".set .Lregnr, .Lregnr+1\n"								\
+	".endr\n"										\
+	".if (.Lfound != 1)\n"									\
+	".error \"warn_add_reg: bad register argument\"\n"					\
+	".endif\n"										\
+	".endm\n"
+
+#define UNDEFINE_WARN_REG \
+	".purgem warn_add_reg\n"
+
+#define __WARN_printf_3(flags, format, arg1, arg2, arg3)		\
+do {									\
+	__auto_type __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_FORMAT; \
+	instrumentation_begin();					\
+	asm_inline volatile(						\
+		DEFINE_WARN_REG						\
+		".set warn_imm, 0\n"					\
+		"warn_add_reg warn_imm, %[a3]\n"			\
+		".set warn_imm, (warn_imm << 4)\n"			\
+		"warn_add_reg warn_imm, %[a2]\n"			\
+		UNDEFINE_WARN_REG					\
+		"1: ud1 warn_imm(%%ecx),%[a1]\n"			\
+		ANNOTATE_REACHABLE(1b)					\
+		__stringify(ASM_BUGTABLE_FLAGS(1b, %c[fmt], %c[file], %c[line], %c[fl])) "\n" \
+		     : : [file] "i" (__FILE__), [line] "i" (__LINE__),	\
+			 [fl] "i" (__flags), [fmt] "i" (format),	\
+			 [a1] "r" ((unsigned long)(arg1)),		\
+			 [a2] "r" ((unsigned long)(arg2)),		\
+			 [a3] "r" ((unsigned long)(arg3)));		\
+	instrumentation_end();						\
+} while (0)
+
+#define __WARN_printf_n(flags, fmt, arg...) do {			\
+		instrumentation_begin();				\
+		__warn_printk(fmt, arg);				\
+		__WARN_FLAGS(BUGFLAG_NO_CUT_HERE | (flags));		\
+		instrumentation_end();					\
+	} while (0)
+
+#define WARN_ARGS(X...) __COUNT_ARGS(, ##X, n, n, n, n, n, n, n, n, n, n, n, n, 3, 2, 1, 0)
+
+#define __WARN_printf(taint, fmt, arg...) \
+	CONCATENATE(__WARN_printf_, WARN_ARGS(arg))(BUGFLAG_TAINT(taint), fmt, ## arg)
+
+#endif /* HAVE_ARCH_BUG_FORMAT */
+
 #include <asm-generic/bug.h>
 
 #endif /* _ASM_X86_BUG_H */
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -81,18 +81,6 @@
 
 DECLARE_BITMAP(system_vectors, NR_VECTORS);
 
-__always_inline int is_valid_bugaddr(unsigned long addr)
-{
-	if (addr < TASK_SIZE_MAX)
-		return 0;
-
-	/*
-	 * We got #UD, if the text isn't readable we'd have gotten
-	 * a different exception.
-	 */
-	return *(unsigned short *)addr == INSN_UD2;
-}
-
 /*
  * Check for UD1 or UD2, accounting for Address Size Override Prefixes.
  * If it's a UD1, further decode to determine its use:
@@ -103,24 +91,49 @@ __always_inline int is_valid_bugaddr(uns
  * UBSan{10}:    67 0f b9 40 10          ud1    0x10(%eax),%eax
  * static_call:  0f b9 cc                ud1    %esp,%ecx
  *
- * Notably UBSAN uses EAX, static_call uses ECX.
+ * WARN_printf_0:                        ud2
+ * WARN_printf_1:                        ud1    (%ecx),%reg
+ * WARN_printf_2:                        ud1    %reg,%reg
+ * WARN_printf_3:                        ud1    0xBA(%ecx),%reg
+ *
+ * Notably UBSAN uses (%eax), static_call uses %esp.
+ *
+ * @regs will return one register per nibble, typically ModRM reg in the low
+ * nibble and ModRM rm in the next nibble (including REX bits). In case of the
+ * WARN_printf_3 case the 8 bit immediate is used to encode two registers and
+ * a total of 3 nibbles will be set.
+ *
+ * @imm will return the immediate value encoded in the instruction, or 0.
+ *
+ * @len will return the length of the instruction decoded.
  */
-__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
+__always_inline int decode_bug(unsigned long addr, u32 *regs, s32 *imm, int *len)
 {
 	unsigned long start = addr;
+	u8 v, rex = 0, reg, rm;
 	bool lock = false;
-	u8 v;
+	int type = BUG_UD1;
 
 	if (addr < TASK_SIZE_MAX)
 		return BUG_NONE;
 
-	v = *(u8 *)(addr++);
-	if (v == INSN_ASOP)
+	for (;;) {
 		v = *(u8 *)(addr++);
 
-	if (v == INSN_LOCK) {
-		lock = true;
-		v = *(u8 *)(addr++);
+		if (v == INSN_ASOP)
+			continue;
+
+		if (v == INSN_LOCK) {
+			lock = true;
+			continue;
+		}
+
+		if ((v & 0xf0) == 0x40) {
+			rex = v;
+			continue;
+		}
+
+		break;
 	}
 
 	switch (v) {
@@ -141,6 +154,9 @@ __always_inline int decode_bug(unsigned
 		return BUG_NONE;
 	}
 
+	*regs = 0;
+	*imm = 0;
+
 	v = *(u8 *)(addr++);
 	if (v == SECOND_BYTE_OPCODE_UD2) {
 		*len = addr - start;
@@ -150,19 +166,33 @@ __always_inline int decode_bug(unsigned
 	if (v != SECOND_BYTE_OPCODE_UD1)
 		return BUG_NONE;
 
-	*imm = 0;
 	v = *(u8 *)(addr++);		/* ModRM */
-
 	if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
 		addr++;			/* SIB */
 
+	reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex);
+	rm  = X86_MODRM_RM(v)  + 8*!!X86_REX_B(rex);
+	*regs = (rm << 4) | reg;
+
 	/* Decode immediate, if present */
 	switch (X86_MODRM_MOD(v)) {
 	case 0: if (X86_MODRM_RM(v) == 5)
-			addr += 4; /* RIP + disp32 */
+			addr += 4;	/* RIP + disp32 */
+		if (rm == 1)		/* CX */
+			type = BUG_UD1_WARN;
 		break;
 
 	case 1: *imm = *(s8 *)addr;
+		if (rm == 1) {		/* CX */
+			/*
+			 * The 8bit immediate is used to encode two more
+			 * registers, while the rm value is used to encode
+			 * this is a UD1_WARN. Munge the immediate into the
+			 * regs value such that 3 nibbles are set.
+			 */
+			*regs = ((*(u8 *)addr) << 4) | reg;
+			type = BUG_UD1_WARN;
+		}
 		addr += 1;
 		break;
 
@@ -170,18 +200,37 @@ __always_inline int decode_bug(unsigned
 		addr += 4;
 		break;
 
-	case 3: break;
+	case 3: if (rm != 4)		/* SP */
+			type = BUG_UD1_WARN;
+		break;
 	}
 
 	/* record instruction length */
 	*len = addr - start;
 
-	if (X86_MODRM_REG(v) == 0)	/* EAX */
+	if (!rm && X86_MODRM_MOD(v) != 3)	/* (%eax) */
 		return BUG_UD1_UBSAN;
 
-	return BUG_UD1;
+	return type;
 }
 
+int is_valid_bugaddr(unsigned long addr)
+{
+	int ud_type, ud_len;
+	u32 ud_regs;
+	s32 ud_imm;
+
+	if (addr < TASK_SIZE_MAX)
+		return 0;
+
+	/*
+	 * We got #UD, if the text isn't readable we'd have gotten
+	 * a different exception.
+	 */
+	ud_type = decode_bug(addr, &ud_regs, &ud_imm, &ud_len);
+
+	return ud_type == BUG_UD2 || ud_type == BUG_UD1_WARN;
+}
 
 static nokprobe_inline int
 do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
@@ -305,14 +354,42 @@ static inline void handle_invalid_op(str
 		      ILL_ILLOPN, error_get_trap_addr(regs));
 }
 
+#ifdef HAVE_ARCH_BUG_FORMAT
+static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
+{
+	int offset = pt_regs_offset(regs, nr);
+	if (WARN_ON_ONCE(offset < -0))
+		return 0;
+	return *((unsigned long *)((void *)regs + offset));
+}
+
+void __warn_printf(const char *fmt, struct pt_regs *regs)
+{
+	unsigned long a1, a2, a3;
+	u32 r = regs->orig_ax;
+
+	/*
+	 * @r is the ud_regs value from decode_bug() which will have at most 3
+	 * registers encoded. Notably printk() will ignore any spurious
+	 * arguments.
+	 */
+	a1 = pt_regs_val(regs, (r >> 0) & 0xf);
+	a2 = pt_regs_val(regs, (r >> 4) & 0xf);
+	a3 = pt_regs_val(regs, (r >> 8) & 0xf);
+
+	printk(fmt, a1, a2, a3);
+}
+#endif
+
 static noinstr bool handle_bug(struct pt_regs *regs)
 {
 	unsigned long addr = regs->ip;
 	bool handled = false;
 	int ud_type, ud_len;
+	u32 ud_regs;
 	s32 ud_imm;
 
-	ud_type = decode_bug(addr, &ud_imm, &ud_len);
+	ud_type = decode_bug(addr, &ud_regs, &ud_imm, &ud_len);
 	if (ud_type == BUG_NONE)
 		return handled;
 
@@ -334,7 +411,13 @@ static noinstr bool handle_bug(struct pt
 		raw_local_irq_enable();
 
 	switch (ud_type) {
+	case BUG_UD1_WARN:
 	case BUG_UD2:
+		/*
+		 * #UD does not have an error code, use orig_ax to pass the ud_regs value
+		 * to __warn_printf().
+		 */
+		regs->orig_ax = ud_regs;
 		if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
 			handled = true;
 			break;
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Linus Torvalds 6 months, 2 weeks ago
On Mon, 2 Jun 2025 at 07:52, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Use the normal COUNT_ARGS() trick to split the variadic WARN() macro
> into per nr_args sub-marcos, except use a custom mapping such that 4
> and above map to another variadic that does the current thing as
> fallback.

Does this horror work with clang? Because I suspect not. The games you
play with inline asm are too disgusting for words.

But honestly, even if it does,I really hate this kind of insane
complexity for dubious reasons.

If you have a warning that is *so* critical for performance that you
can't deal with the register movement that comes from the compiler
doing this for you, just remove the warning.

Don't make our build system do something this disgusting.

                 Linus
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Peter Zijlstra 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 08:02:24AM -0700, Linus Torvalds wrote:
> On Mon, 2 Jun 2025 at 07:52, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Use the normal COUNT_ARGS() trick to split the variadic WARN() macro
> > into per nr_args sub-marcos, except use a custom mapping such that 4
> > and above map to another variadic that does the current thing as
> > fallback.
> 
> Does this horror work with clang? Because I suspect not. The games you
> play with inline asm are too disgusting for words.

Yes, it absolutely builds with clang. The inline asm isn't something we
don't already do elsewhere :-) *cough* extable *cough*

> But honestly, even if it does,I really hate this kind of insane
> complexity for dubious reasons.
> 
> If you have a warning that is *so* critical for performance that you
> can't deal with the register movement that comes from the compiler
> doing this for you, just remove the warning.
> 
> Don't make our build system do something this disgusting.

It isn't just the register movement. What we currently have for WARN()
is:

WARN(cond, fmt, arg...)

	if (unlikely(cond)) {
		__warn_printf(fmt, arg);
		ud2
	}

Where the UD2 bug_entry will have NO_CUT_HERE, because __warn_printf()
will have started the printing.

Part of the problem is with unlikely() not causing the text to break out
into .text.unlikely, but at best it moves it to the end of whatever
function we're in.

This also means that if you do WARN_ONCE() the whole ONCE machinery is
also dumped into that function.

And now someone wants to go add some KUnit specific testing to this as
well, which is also dumped into the function.

This is all cruft that shouldn't be in the normal .text.


The horrors I did will change things into:

	if (unlikely(cond))
		ud1 regs

where the bug_entry will then have the fmt, and the ud1 instruction some
regs (provided 3 or less args). Then all the ONCE and KUnit crap can
live in the exception handler. Not littered around the real code.


Now, I can still relate to: "this is too horrible for words". But then I
would strongly suggest people go poke at the compilers so we can get:

	if (really_unlikely(cond)) {
		whatever;
	}

such that the compiler is forced to split whatever into a cold
sub-function placed in .text.unlikely. Then it doesn't really matter how
much crap we stick in there, it will not affect the normal code paths /
I$.

Anyway, I had fun hacking this up :-)
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Linus Torvalds 6 months, 2 weeks ago
On Mon, 2 Jun 2025 at 08:50, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Yes, it absolutely builds with clang. The inline asm isn't something we
> don't already do elsewhere :-) *cough* extable *cough*

Eww. I hadn't looked at that (or repressed it if I did). *Shudder*.

But looking around, I don't think any of the normal code I ever look
at actually *generate* that disgusting thing.

I had to search for it, and looked at the absolute horror it generates
in the futex code, and honestly, if I ever have to look at that
garbage, I would throw up.

And WARN_ONCE() is in stuff I *do* look at.

So no. I'm NAK'ing it just because it makes the asm look entirely unreadable.

And no, I'm not ok with only using 'objdump' and friends to look at
assembly generation. I want to be able to do

   make xyz.s

and look at code generation without throwing up.

The fact that we have this disgusting thing elsewhere in places that
I've not looked at does *not* excuse adding it to other places.

                Linus
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Peter Zijlstra 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 09:38:09AM -0700, Linus Torvalds wrote:

> And no, I'm not ok with only using 'objdump' and friends to look at
> assembly generation. I want to be able to do
> 
>    make xyz.s
> 
> and look at code generation without throwing up.

So if I stuff the asm macro in a global asm() block then GCC ends up
looking like so:

        .set warn_imm, 0
        warn_add_reg var=warn_imm reg=%rcx      # tmp215
        .set warn_imm, (warn_imm << 4)
        warn_add_reg var=warn_imm reg=%rdx      # tmp212
        1: ud1 warn_imm(%ecx),%rax      # tmp210
        .pushsection .discard.annotate_insn,"M",@progbits,8
        .long 1b - .
        .long 8
        .popsection
        .pushsection __bug_table, "aw" ; 123: .long 1b - . ; .long .LC76 - . ; .long .LC0 - . ; .word 8710 ; .word 2321 ; .org 123b + 6 + 4 + 6 ; .popsection   #,,,

However, clangd is 'helpful' and fully expands the asm macro for the .s
file :-(
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Linus Torvalds 6 months, 2 weeks ago
On Mon, 2 Jun 2025 at 14:57, Peter Zijlstra <peterz@infradead.org> wrote:
>
> So if I stuff the asm macro in a global asm() block then GCC ends up
> looking like so:

Better, but as then the clang thing looks like a horrendous disaster.

How about we simply make this all *code* instead of playing games with
register numbers?

Why not just push the arguments by hand on the stack, and make that be
the interface? A 'push %reg' is like a byte or two. And you'd do it in
the cold section, so nobody cares.

And the asm would look somewhat sane, instead of being crazy noise due
to crazy macros.

Or so I imagine, because I didn't actually try it.

           Linus
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Peter Zijlstra 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 04:10:16PM -0700, Linus Torvalds wrote:
> On Mon, 2 Jun 2025 at 14:57, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > So if I stuff the asm macro in a global asm() block then GCC ends up
> > looking like so:
> 
> Better, but as then the clang thing looks like a horrendous disaster.
> 
> How about we simply make this all *code* instead of playing games with
> register numbers?
> 
> Why not just push the arguments by hand on the stack, and make that be
> the interface? A 'push %reg' is like a byte or two. And you'd do it in
> the cold section, so nobody cares.
> 
> And the asm would look somewhat sane, instead of being crazy noise due
> to crazy macros.
> 
> Or so I imagine, because I didn't actually try it.

Yeah, I can make that work. 

I've been trying to make __WARN_printk() (or similar) do a tail-call to
a "UD2; RET;" stub. But doing printk() in a function makes GCC generate
wild code that refuses to actually tail-call :/

The next crazy idea was to make a variant of __WARN_printk() that takes
a struct bug_entry * as first argument such that it has access to the
bug entry and then take the trap on the way out (while keeping the
pointer in the first argument) and then have the trap handler complete
things.

That way it would all 'just work'. Except I can't seem to force GCC to
emit that tail-call :-(

I'll prod at it some more.
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by David Laight 6 months, 2 weeks ago
On Tue, 3 Jun 2025 15:04:55 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Jun 02, 2025 at 04:10:16PM -0700, Linus Torvalds wrote:
> > On Mon, 2 Jun 2025 at 14:57, Peter Zijlstra <peterz@infradead.org> wrote:  
> > >
> > > So if I stuff the asm macro in a global asm() block then GCC ends up
> > > looking like so:  
> > 
> > Better, but as then the clang thing looks like a horrendous disaster.
> > 
> > How about we simply make this all *code* instead of playing games with
> > register numbers?
> > 
> > Why not just push the arguments by hand on the stack, and make that be
> > the interface? A 'push %reg' is like a byte or two. And you'd do it in
> > the cold section, so nobody cares.
> > 
> > And the asm would look somewhat sane, instead of being crazy noise due
> > to crazy macros.
> > 
> > Or so I imagine, because I didn't actually try it.  
> 
> Yeah, I can make that work. 
> 
> I've been trying to make __WARN_printk() (or similar) do a tail-call to
> a "UD2; RET;" stub. But doing printk() in a function makes GCC generate
> wild code that refuses to actually tail-call :/
> 
> The next crazy idea was to make a variant of __WARN_printk() that takes
> a struct bug_entry * as first argument such that it has access to the
> bug entry and then take the trap on the way out (while keeping the
> pointer in the first argument) and then have the trap handler complete
> things.
> 
> That way it would all 'just work'. Except I can't seem to force GCC to
> emit that tail-call :-(
> 
> I'll prod at it some more.

How about a slightly less generic macro, something that could be:
#define WARN_IF(a, op, b, msg) \
	if (unlikely((a) op (b)) { \
		printf("WARN: %s (%x " #op " %x)\n", \
			msg ? msg : "(" #a ") " #op " (" #b ")", (a), (b)); \
	}
but could just be:
	if (unlikely((a) op (b)) {
		asm(	" ud2; cmp %0, %1"
			" .asciz msg; .asciz #op"
			:: "r" (a), "r" (b));
	}
So a ud2 followed by a reg-reg compare (should be REX/D16 prefix followed
by '3[89ab] /r') and two strings (literals or addresses).
With a suitable exception table entry.

That saves the problem of a generic printf format while still giving the
values of the variables associated with the failing test (for simple tests).
It should also avoid destroying register assignment for the rest of the
function.

If gcc refuses to do a jump for the 'if (unlikely...))' try adding an 'else'
clause containing an asm comment.
I've done that in the past to convert a backwards conditional jump (predicted taken)
into a forwards jump (predicted not taken) to an unconditional jump to the
actual target.
(I had a very tight clock limit for the 'worst case' path.)

	David
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Peter Zijlstra 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 11:57:25PM +0200, Peter Zijlstra wrote:
> On Mon, Jun 02, 2025 at 09:38:09AM -0700, Linus Torvalds wrote:
> 
> > And no, I'm not ok with only using 'objdump' and friends to look at
> > assembly generation. I want to be able to do
> > 
> >    make xyz.s
> > 
> > and look at code generation without throwing up.
> 
> So if I stuff the asm macro in a global asm() block then GCC ends up
> looking like so:
> 
>         .set warn_imm, 0
>         warn_add_reg var=warn_imm reg=%rcx      # tmp215
>         .set warn_imm, (warn_imm << 4)
>         warn_add_reg var=warn_imm reg=%rdx      # tmp212
>         1: ud1 warn_imm(%ecx),%rax      # tmp210
>         .pushsection .discard.annotate_insn,"M",@progbits,8
>         .long 1b - .
>         .long 8
>         .popsection
>         .pushsection __bug_table, "aw" ; 123: .long 1b - . ; .long .LC76 - . ; .long .LC0 - . ; .word 8710 ; .word 2321 ; .org 123b + 6 + 4 + 6 ; .popsection   #,,,
> 
> However, clangd is 'helpful' and fully expands the asm macro for the .s
> file :-(

It generates his horror show:

	#APP
.set warn_imm, 0
.set .Lregnr, 0
.set .Lregnr, 1
.set .Lregnr, 2
.set warn_imm, 2
.set .Lregnr, 3
.set .Lregnr, 4
.set .Lregnr, 5
.set .Lregnr, 6
.set .Lregnr, 7
.set .Lregnr, 8
.set .Lregnr, 9
.set .Lregnr, 10
.set .Lregnr, 11
.set .Lregnr, 12
.set .Lregnr, 13
.set .Lregnr, 14
.set .Lregnr, 15
.set .Lregnr, 16
.set .Lregnr, 0
.set .Lregnr, 1
.set .Lregnr, 2
.set .Lregnr, 3
.set .Lregnr, 4
.set .Lregnr, 5
.set .Lregnr, 6
.set .Lregnr, 7
.set .Lregnr, 8
.set .Lregnr, 9
.set .Lregnr, 10
.set .Lregnr, 11
.set .Lregnr, 12
.set .Lregnr, 13
.set .Lregnr, 14
.set .Lregnr, 15
.set .Lregnr, 16
.set warn_imm, 32
.set .Lregnr, 0
.set .Lregnr, 1
.set warn_imm, 33
.set .Lregnr, 2
.set .Lregnr, 3
.set .Lregnr, 4
.set .Lregnr, 5
.set .Lregnr, 6
.set .Lregnr, 7
.set .Lregnr, 8
.set .Lregnr, 9
.set .Lregnr, 10
.set .Lregnr, 11
.set .Lregnr, 12
.set .Lregnr, 13
.set .Lregnr, 14
.set .Lregnr, 15
.set .Lregnr, 16
.set .Lregnr, 0
.set .Lregnr, 1
.set .Lregnr, 2
.set .Lregnr, 3
.set .Lregnr, 4
.set .Lregnr, 5
.set .Lregnr, 6
.set .Lregnr, 7
.set .Lregnr, 8
.set .Lregnr, 9
.set .Lregnr, 10
.set .Lregnr, 11
.set .Lregnr, 12
.set .Lregnr, 13
.set .Lregnr, 14
.set .Lregnr, 15
.set .Lregnr, 16
.Ltmp1656:
	ud1q	33(%ecx), %rax
	.section	.discard.annotate_insn,"M",@progbits,8
.Ltmp1657:
	.long	.Ltmp1656-.Ltmp1657
	.long	8
	.section	.init.text,"ax",@progbits

	.section	__bug_table,"aw",@progbits
.Ltmp1658:
.Ltmp1659:
	.long	.Ltmp1656-.Ltmp1659
.Ltmp1660:
	.long	.L.str.29-.Ltmp1660
.Ltmp1661:
	.long	.L.str-.Ltmp1661
	.short	8710
	.short	2321
.org ((.Ltmp1658+6)+4)+6, 0
	.section	.init.text,"ax",@progbits


	#NO_APP
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Peter Zijlstra 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 09:38:09AM -0700, Linus Torvalds wrote:
> On Mon, 2 Jun 2025 at 08:50, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Yes, it absolutely builds with clang. The inline asm isn't something we
> > don't already do elsewhere :-) *cough* extable *cough*
> 
> Eww. I hadn't looked at that (or repressed it if I did). *Shudder*.
> 
> But looking around, I don't think any of the normal code I ever look
> at actually *generate* that disgusting thing.
> 
> I had to search for it, and looked at the absolute horror it generates
> in the futex code, and honestly, if I ever have to look at that
> garbage, I would throw up.
> 
> And WARN_ONCE() is in stuff I *do* look at.
> 
> So no. I'm NAK'ing it just because it makes the asm look entirely unreadable.
> 
> And no, I'm not ok with only using 'objdump' and friends to look at
> assembly generation. I want to be able to do
> 
>    make xyz.s
> 
> and look at code generation without throwing up.
> 
> The fact that we have this disgusting thing elsewhere in places that
> I've not looked at does *not* excuse adding it to other places.

Right. So the problem is using asm macros in inline-asm. We've tried
adding those macros to a global asm, but IIRC that had trouble.

So yeah, now we do the macro definition and purge right around the
inline asm and then you get this horror show :-(

Anyway, I'll try and come up with something else.
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Josh Poimboeuf 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 08:09:22PM +0200, Peter Zijlstra wrote:
> Right. So the problem is using asm macros in inline-asm. We've tried
> adding those macros to a global asm, but IIRC that had trouble.

Yeah, IIRC, the problem I ran into was that clang doesn't allow defining
asm macros in global asm().

-- 
Josh
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Andrew Cooper 6 months, 2 weeks ago
> Yeah, IIRC, the problem I ran into was that clang doesn't allow defining
> asm macros in global asm().

Macros in global asm was fixed in Clang 6. 
https://bugs.llvm.org/show_bug.cgi?id=36110

But https://github.com/llvm/llvm-project/issues/60792 is still unfixed
and waiting to trip people up.

~Andrew
Re: [RFC 6/8] x86_64/bug: Implement __WARN_printf()
Posted by Josh Poimboeuf 6 months, 2 weeks ago
On Mon, Jun 02, 2025 at 08:09:22PM +0200, Peter Zijlstra wrote:
> On Mon, Jun 02, 2025 at 09:38:09AM -0700, Linus Torvalds wrote:
> > On Mon, 2 Jun 2025 at 08:50, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Yes, it absolutely builds with clang. The inline asm isn't something we
> > > don't already do elsewhere :-) *cough* extable *cough*
> > 
> > Eww. I hadn't looked at that (or repressed it if I did). *Shudder*.
> > 
> > But looking around, I don't think any of the normal code I ever look
> > at actually *generate* that disgusting thing.
> > 
> > I had to search for it, and looked at the absolute horror it generates
> > in the futex code, and honestly, if I ever have to look at that
> > garbage, I would throw up.
> > 
> > And WARN_ONCE() is in stuff I *do* look at.
> > 
> > So no. I'm NAK'ing it just because it makes the asm look entirely unreadable.
> > 
> > And no, I'm not ok with only using 'objdump' and friends to look at
> > assembly generation. I want to be able to do
> > 
> >    make xyz.s
> > 
> > and look at code generation without throwing up.
> > 
> > The fact that we have this disgusting thing elsewhere in places that
> > I've not looked at does *not* excuse adding it to other places.
> 
> Right. So the problem is using asm macros in inline-asm. We've tried
> adding those macros to a global asm, but IIRC that had trouble.
> 
> So yeah, now we do the macro definition and purge right around the
> inline asm and then you get this horror show :-(
> 
> Anyway, I'll try and come up with something else.

At least the purge could be skipped if you check a global assembler
variable before defining the macro.

For example here was one of my experiments trying to making alternatives
more readable:

  #define DEFINE_ALT_MACRO							\
	".ifndef alt; "								\
	".set alt, 1; "								\
	__stringify(ALTERNATIVE_MACRO) "; "					\
	".endif\n\n\t"								\

  #define __ALTERNATIVE(oldinstr, newinstr, ft_flags, ft_flags_str)		\
	"\n# <ALTERNATIVE>\n"							\
	DEFINE_ALT_MACRO							\
	"alternative \"" oldinstr "\", \""					\
			 newinstr "\", \""					\
			 __stringify(ft_flags) "\" /* " ft_flags_str " */"	\
	"\n# </ALTERNATIVE>\n"

which produced code like:

  # <ALTERNATIVE>
  .ifndef alt; .set alt, 1; .macro __alt_replace orig, orig_end, newinstr, ft_flags, args:vararg; .skip -(((.Lnew_end_\@ - .Lnew_\@) - (\orig_end - \orig)) > 0) * ((.Lnew_end_\@ - .Lnew_\@) - (\orig_end - \orig)), 0x90; .Lskip_end_\@: .pushsection .altinstr_replacement, "ax"; .Lnew_\@: \newinstr; .Lnew_end_\@: .popsection; .pushsection .altinstructions, "a"; .long \orig - .; .long .Lnew_\@ - .; .4byte \ft_flags; .byte .Lskip_end_\@ - \orig; .byte .Lnew_end_\@ - .Lnew_\@; .popsection; .ifnb \args; __alt_replace \orig, \orig_end, \args; .endif; .endm; .macro alternative oldinstr, args:vararg; .Lorig_\@: \oldinstr; .Lorig_end_\@: __alt_replace .Lorig_\@, .Lorig_end_\@, \args; .endm; .endif

	alternative "", "stac", "( 9*32+20)" /* X86_FEATURE_SMAP */
  # </ALTERNATIVE>

which is much more readable if you ignore the horrendous first line ;-)

-- 
Josh