Optimize __WARN_printf() for the common number of arguments.
By storing the WARN() format string in the bug_entry, it becomes
possible to print this from the exception handler, instead of having
to emit a __warn_printf() call before tripping the exception.
This yields more compact code, while also providing an opportunity to
simplify WARN_ONCE() -- in a future patch.
The only 'minor' detail is how to get the printf() arguments into the
exception handler.
Note that the regular calling convention uses 6 registers to pass
arguments (before pushing to the stack) and observe that UD1 has a
ModRM byte which can encode two registers.
This then suggests using UD1 exceptions for small __WARN_printf()
argument numbers, encoding the registers holding the arguments in the
UD1 instruction.
Except that, when creating a histogram of the number of arguments
given to WARN (including WARN_ONCE):
defconfig: allyesconfig:
899 __COUNT_printf_3 9694 __COUNT_printf_0
795 __COUNT_printf_2 4498 __COUNT_printf_1
360 __COUNT_printf_0 3213 __COUNT_printf_3
212 __COUNT_printf_1 2949 __COUNT_printf_2
86 __COUNT_printf_4 186 __COUNT_printf_4
37 __COUNT_printf_5 84 __COUNT_printf_5
15 __COUNT_printf_6 50 __COUNT_printf_6
14 __COUNT_printf_7 16 __COUNT_printf_7
3 __COUNT_printf_8 3 __COUNT_printf_8
3 __COUNT_printf_12 3 __COUNT_printf_12
2 __COUNT_printf_9
It becomes clear that supporting 3 arguments is rather critical. This
requires encoding 3 registers in an instruction, which calls for some
creativity :-)
Use the form:
UD1 disp8(%ecx), %reg
And encode two registers (one per nibble) in the displacement value.
Use (%ecx) as base, because UBSAN already uses (%eax) to encode its
immediates.
This then yields the following encodings:
nr args: insn:
0 UD2
1 UD1 (%ecx), %reg
2 UD1 %reg2, %reg1
3 UD1 disp8(%ecx), %reg1
Use the normal COUNT_ARGS() trick to split the variadic WARN() macro
into per nr_args sub-marcos, except use a custom mapping such that 4
and above map to another variadic that does the current thing as
fallback.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/x86/include/asm/bug.h | 115 +++++++++++++++++++++++++++++++++++++
arch/x86/kernel/traps.c | 137 ++++++++++++++++++++++++++++++++++++---------
2 files changed, 225 insertions(+), 27 deletions(-)
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -5,6 +5,7 @@
#include <linux/stringify.h>
#include <linux/instrumentation.h>
#include <linux/objtool.h>
+#include <linux/args.h>
/*
* Despite that some emulators terminate on UD2, we use it for WARN().
@@ -26,6 +27,7 @@
#define BUG_UD2 0xfffe
#define BUG_UD1 0xfffd
#define BUG_UD1_UBSAN 0xfffc
+#define BUG_UD1_WARN 0xfffb
#define BUG_EA 0xffea
#define BUG_LOCK 0xfff0
@@ -108,6 +110,119 @@ do { \
instrumentation_end(); \
} while (0)
+#ifdef HAVE_ARCH_BUG_FORMAT
+
+#ifndef __ASSEMBLER__
+struct pt_regs;
+extern void __warn_printf(const char *fmt, struct pt_regs *regs);
+#define __warn_printf __warn_printf
+#endif
+
+#define __WARN_printf_0(flags, format) \
+do { \
+ __auto_type __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_FORMAT; \
+ instrumentation_begin(); \
+ asm_inline volatile("1: ud2\n" \
+ ANNOTATE_REACHABLE(1b) \
+ __stringify(ASM_BUGTABLE_FLAGS(1b, %c[fmt], %c[file], %c[line], %c[fl])) "\n" \
+ : : [file] "i" (__FILE__), [line] "i" (__LINE__), \
+ [fl] "i" (__flags), [fmt] "i" (format)); \
+ instrumentation_end(); \
+} while (0)
+
+#define __WARN_printf_1(flags, format, arg1) \
+do { \
+ __auto_type __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_FORMAT; \
+ instrumentation_begin(); \
+ asm_inline volatile("1: ud1 (%%ecx), %[a1]\n" \
+ ANNOTATE_REACHABLE(1b) \
+ __stringify(ASM_BUGTABLE_FLAGS(1b, %c[fmt], %c[file], %c[line], %c[fl])) "\n" \
+ : : [file] "i" (__FILE__), [line] "i" (__LINE__), \
+ [fl] "i" (__flags), [fmt] "i" (format), \
+ [a1] "r" ((unsigned long)(arg1))); \
+ instrumentation_end(); \
+} while (0)
+
+#define __WARN_printf_2(flags, format, arg1, arg2) \
+do { \
+ __auto_type __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_FORMAT; \
+ instrumentation_begin(); \
+ asm_inline volatile("1: ud1 %[a2], %[a1]\n" \
+ ANNOTATE_REACHABLE(1b) \
+ __stringify(ASM_BUGTABLE_FLAGS(1b, %c[fmt], %c[file], %c[line], %c[fl])) "\n" \
+ : : [file] "i" (__FILE__), [line] "i" (__LINE__), \
+ [fl] "i" (__flags), [fmt] "i" (format), \
+ [a1] "r" ((unsigned long)(arg1)), \
+ [a2] "r" ((unsigned long)(arg2))); \
+ instrumentation_end(); \
+} while (0)
+
+/*
+ * warn_add_reg var reg -- adds the machine register index to var
+ */
+#define DEFINE_WARN_REG \
+ ".macro warn_add_reg var:req reg:req\n" \
+ ".set .Lfound, 0\n" \
+ ".set .Lregnr, 0\n" \
+ ".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".set \\var, \\var + .Lregnr\n" \
+ ".endif\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
+ ".endr\n" \
+ ".set .Lregnr, 0\n" \
+ ".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".set \\var, \\var + .Lregnr\n" \
+ ".endif\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
+ ".endr\n" \
+ ".if (.Lfound != 1)\n" \
+ ".error \"warn_add_reg: bad register argument\"\n" \
+ ".endif\n" \
+ ".endm\n"
+
+#define UNDEFINE_WARN_REG \
+ ".purgem warn_add_reg\n"
+
+#define __WARN_printf_3(flags, format, arg1, arg2, arg3) \
+do { \
+ __auto_type __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_FORMAT; \
+ instrumentation_begin(); \
+ asm_inline volatile( \
+ DEFINE_WARN_REG \
+ ".set warn_imm, 0\n" \
+ "warn_add_reg warn_imm, %[a3]\n" \
+ ".set warn_imm, (warn_imm << 4)\n" \
+ "warn_add_reg warn_imm, %[a2]\n" \
+ UNDEFINE_WARN_REG \
+ "1: ud1 warn_imm(%%ecx),%[a1]\n" \
+ ANNOTATE_REACHABLE(1b) \
+ __stringify(ASM_BUGTABLE_FLAGS(1b, %c[fmt], %c[file], %c[line], %c[fl])) "\n" \
+ : : [file] "i" (__FILE__), [line] "i" (__LINE__), \
+ [fl] "i" (__flags), [fmt] "i" (format), \
+ [a1] "r" ((unsigned long)(arg1)), \
+ [a2] "r" ((unsigned long)(arg2)), \
+ [a3] "r" ((unsigned long)(arg3))); \
+ instrumentation_end(); \
+} while (0)
+
+#define __WARN_printf_n(flags, fmt, arg...) do { \
+ instrumentation_begin(); \
+ __warn_printk(fmt, arg); \
+ __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | (flags)); \
+ instrumentation_end(); \
+ } while (0)
+
+#define WARN_ARGS(X...) __COUNT_ARGS(, ##X, n, n, n, n, n, n, n, n, n, n, n, n, 3, 2, 1, 0)
+
+#define __WARN_printf(taint, fmt, arg...) \
+ CONCATENATE(__WARN_printf_, WARN_ARGS(arg))(BUGFLAG_TAINT(taint), fmt, ## arg)
+
+#endif /* HAVE_ARCH_BUG_FORMAT */
+
#include <asm-generic/bug.h>
#endif /* _ASM_X86_BUG_H */
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -81,18 +81,6 @@
DECLARE_BITMAP(system_vectors, NR_VECTORS);
-__always_inline int is_valid_bugaddr(unsigned long addr)
-{
- if (addr < TASK_SIZE_MAX)
- return 0;
-
- /*
- * We got #UD, if the text isn't readable we'd have gotten
- * a different exception.
- */
- return *(unsigned short *)addr == INSN_UD2;
-}
-
/*
* Check for UD1 or UD2, accounting for Address Size Override Prefixes.
* If it's a UD1, further decode to determine its use:
@@ -103,24 +91,49 @@ __always_inline int is_valid_bugaddr(uns
* UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
* static_call: 0f b9 cc ud1 %esp,%ecx
*
- * Notably UBSAN uses EAX, static_call uses ECX.
+ * WARN_printf_0: ud2
+ * WARN_printf_1: ud1 (%ecx),%reg
+ * WARN_printf_2: ud1 %reg,%reg
+ * WARN_printf_3: ud1 0xBA(%ecx),%reg
+ *
+ * Notably UBSAN uses (%eax), static_call uses %esp.
+ *
+ * @regs will return one register per nibble, typically ModRM reg in the low
+ * nibble and ModRM rm in the next nibble (including REX bits). In case of the
+ * WARN_printf_3 case the 8 bit immediate is used to encode two registers and
+ * a total of 3 nibbles will be set.
+ *
+ * @imm will return the immediate value encoded in the instruction, or 0.
+ *
+ * @len will return the length of the instruction decoded.
*/
-__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
+__always_inline int decode_bug(unsigned long addr, u32 *regs, s32 *imm, int *len)
{
unsigned long start = addr;
+ u8 v, rex = 0, reg, rm;
bool lock = false;
- u8 v;
+ int type = BUG_UD1;
if (addr < TASK_SIZE_MAX)
return BUG_NONE;
- v = *(u8 *)(addr++);
- if (v == INSN_ASOP)
+ for (;;) {
v = *(u8 *)(addr++);
- if (v == INSN_LOCK) {
- lock = true;
- v = *(u8 *)(addr++);
+ if (v == INSN_ASOP)
+ continue;
+
+ if (v == INSN_LOCK) {
+ lock = true;
+ continue;
+ }
+
+ if ((v & 0xf0) == 0x40) {
+ rex = v;
+ continue;
+ }
+
+ break;
}
switch (v) {
@@ -141,6 +154,9 @@ __always_inline int decode_bug(unsigned
return BUG_NONE;
}
+ *regs = 0;
+ *imm = 0;
+
v = *(u8 *)(addr++);
if (v == SECOND_BYTE_OPCODE_UD2) {
*len = addr - start;
@@ -150,19 +166,33 @@ __always_inline int decode_bug(unsigned
if (v != SECOND_BYTE_OPCODE_UD1)
return BUG_NONE;
- *imm = 0;
v = *(u8 *)(addr++); /* ModRM */
-
if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
addr++; /* SIB */
+ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex);
+ rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex);
+ *regs = (rm << 4) | reg;
+
/* Decode immediate, if present */
switch (X86_MODRM_MOD(v)) {
case 0: if (X86_MODRM_RM(v) == 5)
- addr += 4; /* RIP + disp32 */
+ addr += 4; /* RIP + disp32 */
+ if (rm == 1) /* CX */
+ type = BUG_UD1_WARN;
break;
case 1: *imm = *(s8 *)addr;
+ if (rm == 1) { /* CX */
+ /*
+ * The 8bit immediate is used to encode two more
+ * registers, while the rm value is used to encode
+ * this is a UD1_WARN. Munge the immediate into the
+ * regs value such that 3 nibbles are set.
+ */
+ *regs = ((*(u8 *)addr) << 4) | reg;
+ type = BUG_UD1_WARN;
+ }
addr += 1;
break;
@@ -170,18 +200,37 @@ __always_inline int decode_bug(unsigned
addr += 4;
break;
- case 3: break;
+ case 3: if (rm != 4) /* SP */
+ type = BUG_UD1_WARN;
+ break;
}
/* record instruction length */
*len = addr - start;
- if (X86_MODRM_REG(v) == 0) /* EAX */
+ if (!rm && X86_MODRM_MOD(v) != 3) /* (%eax) */
return BUG_UD1_UBSAN;
- return BUG_UD1;
+ return type;
}
+int is_valid_bugaddr(unsigned long addr)
+{
+ int ud_type, ud_len;
+ u32 ud_regs;
+ s32 ud_imm;
+
+ if (addr < TASK_SIZE_MAX)
+ return 0;
+
+ /*
+ * We got #UD, if the text isn't readable we'd have gotten
+ * a different exception.
+ */
+ ud_type = decode_bug(addr, &ud_regs, &ud_imm, &ud_len);
+
+ return ud_type == BUG_UD2 || ud_type == BUG_UD1_WARN;
+}
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
@@ -305,14 +354,42 @@ static inline void handle_invalid_op(str
ILL_ILLOPN, error_get_trap_addr(regs));
}
+#ifdef HAVE_ARCH_BUG_FORMAT
+static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
+{
+ int offset = pt_regs_offset(regs, nr);
+ if (WARN_ON_ONCE(offset < -0))
+ return 0;
+ return *((unsigned long *)((void *)regs + offset));
+}
+
+void __warn_printf(const char *fmt, struct pt_regs *regs)
+{
+ unsigned long a1, a2, a3;
+ u32 r = regs->orig_ax;
+
+ /*
+ * @r is the ud_regs value from decode_bug() which will have at most 3
+ * registers encoded. Notably printk() will ignore any spurious
+ * arguments.
+ */
+ a1 = pt_regs_val(regs, (r >> 0) & 0xf);
+ a2 = pt_regs_val(regs, (r >> 4) & 0xf);
+ a3 = pt_regs_val(regs, (r >> 8) & 0xf);
+
+ printk(fmt, a1, a2, a3);
+}
+#endif
+
static noinstr bool handle_bug(struct pt_regs *regs)
{
unsigned long addr = regs->ip;
bool handled = false;
int ud_type, ud_len;
+ u32 ud_regs;
s32 ud_imm;
- ud_type = decode_bug(addr, &ud_imm, &ud_len);
+ ud_type = decode_bug(addr, &ud_regs, &ud_imm, &ud_len);
if (ud_type == BUG_NONE)
return handled;
@@ -334,7 +411,13 @@ static noinstr bool handle_bug(struct pt
raw_local_irq_enable();
switch (ud_type) {
+ case BUG_UD1_WARN:
case BUG_UD2:
+ /*
+ * #UD does not have an error code, use orig_ax to pass the ud_regs value
+ * to __warn_printf().
+ */
+ regs->orig_ax = ud_regs;
if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
handled = true;
break;
On Mon, 2 Jun 2025 at 07:52, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Use the normal COUNT_ARGS() trick to split the variadic WARN() macro
> into per nr_args sub-marcos, except use a custom mapping such that 4
> and above map to another variadic that does the current thing as
> fallback.
Does this horror work with clang? Because I suspect not. The games you
play with inline asm are too disgusting for words.
But honestly, even if it does,I really hate this kind of insane
complexity for dubious reasons.
If you have a warning that is *so* critical for performance that you
can't deal with the register movement that comes from the compiler
doing this for you, just remove the warning.
Don't make our build system do something this disgusting.
Linus
On Mon, Jun 02, 2025 at 08:02:24AM -0700, Linus Torvalds wrote:
> On Mon, 2 Jun 2025 at 07:52, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Use the normal COUNT_ARGS() trick to split the variadic WARN() macro
> > into per nr_args sub-marcos, except use a custom mapping such that 4
> > and above map to another variadic that does the current thing as
> > fallback.
>
> Does this horror work with clang? Because I suspect not. The games you
> play with inline asm are too disgusting for words.
Yes, it absolutely builds with clang. The inline asm isn't something we
don't already do elsewhere :-) *cough* extable *cough*
> But honestly, even if it does,I really hate this kind of insane
> complexity for dubious reasons.
>
> If you have a warning that is *so* critical for performance that you
> can't deal with the register movement that comes from the compiler
> doing this for you, just remove the warning.
>
> Don't make our build system do something this disgusting.
It isn't just the register movement. What we currently have for WARN()
is:
WARN(cond, fmt, arg...)
if (unlikely(cond)) {
__warn_printf(fmt, arg);
ud2
}
Where the UD2 bug_entry will have NO_CUT_HERE, because __warn_printf()
will have started the printing.
Part of the problem is with unlikely() not causing the text to break out
into .text.unlikely, but at best it moves it to the end of whatever
function we're in.
This also means that if you do WARN_ONCE() the whole ONCE machinery is
also dumped into that function.
And now someone wants to go add some KUnit specific testing to this as
well, which is also dumped into the function.
This is all cruft that shouldn't be in the normal .text.
The horrors I did will change things into:
if (unlikely(cond))
ud1 regs
where the bug_entry will then have the fmt, and the ud1 instruction some
regs (provided 3 or less args). Then all the ONCE and KUnit crap can
live in the exception handler. Not littered around the real code.
Now, I can still relate to: "this is too horrible for words". But then I
would strongly suggest people go poke at the compilers so we can get:
if (really_unlikely(cond)) {
whatever;
}
such that the compiler is forced to split whatever into a cold
sub-function placed in .text.unlikely. Then it doesn't really matter how
much crap we stick in there, it will not affect the normal code paths /
I$.
Anyway, I had fun hacking this up :-)
On Mon, 2 Jun 2025 at 08:50, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Yes, it absolutely builds with clang. The inline asm isn't something we
> don't already do elsewhere :-) *cough* extable *cough*
Eww. I hadn't looked at that (or repressed it if I did). *Shudder*.
But looking around, I don't think any of the normal code I ever look
at actually *generate* that disgusting thing.
I had to search for it, and looked at the absolute horror it generates
in the futex code, and honestly, if I ever have to look at that
garbage, I would throw up.
And WARN_ONCE() is in stuff I *do* look at.
So no. I'm NAK'ing it just because it makes the asm look entirely unreadable.
And no, I'm not ok with only using 'objdump' and friends to look at
assembly generation. I want to be able to do
make xyz.s
and look at code generation without throwing up.
The fact that we have this disgusting thing elsewhere in places that
I've not looked at does *not* excuse adding it to other places.
Linus
On Mon, Jun 02, 2025 at 09:38:09AM -0700, Linus Torvalds wrote:
> And no, I'm not ok with only using 'objdump' and friends to look at
> assembly generation. I want to be able to do
>
> make xyz.s
>
> and look at code generation without throwing up.
So if I stuff the asm macro in a global asm() block then GCC ends up
looking like so:
.set warn_imm, 0
warn_add_reg var=warn_imm reg=%rcx # tmp215
.set warn_imm, (warn_imm << 4)
warn_add_reg var=warn_imm reg=%rdx # tmp212
1: ud1 warn_imm(%ecx),%rax # tmp210
.pushsection .discard.annotate_insn,"M",@progbits,8
.long 1b - .
.long 8
.popsection
.pushsection __bug_table, "aw" ; 123: .long 1b - . ; .long .LC76 - . ; .long .LC0 - . ; .word 8710 ; .word 2321 ; .org 123b + 6 + 4 + 6 ; .popsection #,,,
However, clangd is 'helpful' and fully expands the asm macro for the .s
file :-(
On Mon, 2 Jun 2025 at 14:57, Peter Zijlstra <peterz@infradead.org> wrote:
>
> So if I stuff the asm macro in a global asm() block then GCC ends up
> looking like so:
Better, but as then the clang thing looks like a horrendous disaster.
How about we simply make this all *code* instead of playing games with
register numbers?
Why not just push the arguments by hand on the stack, and make that be
the interface? A 'push %reg' is like a byte or two. And you'd do it in
the cold section, so nobody cares.
And the asm would look somewhat sane, instead of being crazy noise due
to crazy macros.
Or so I imagine, because I didn't actually try it.
Linus
On Mon, Jun 02, 2025 at 04:10:16PM -0700, Linus Torvalds wrote: > On Mon, 2 Jun 2025 at 14:57, Peter Zijlstra <peterz@infradead.org> wrote: > > > > So if I stuff the asm macro in a global asm() block then GCC ends up > > looking like so: > > Better, but as then the clang thing looks like a horrendous disaster. > > How about we simply make this all *code* instead of playing games with > register numbers? > > Why not just push the arguments by hand on the stack, and make that be > the interface? A 'push %reg' is like a byte or two. And you'd do it in > the cold section, so nobody cares. > > And the asm would look somewhat sane, instead of being crazy noise due > to crazy macros. > > Or so I imagine, because I didn't actually try it. Yeah, I can make that work. I've been trying to make __WARN_printk() (or similar) do a tail-call to a "UD2; RET;" stub. But doing printk() in a function makes GCC generate wild code that refuses to actually tail-call :/ The next crazy idea was to make a variant of __WARN_printk() that takes a struct bug_entry * as first argument such that it has access to the bug entry and then take the trap on the way out (while keeping the pointer in the first argument) and then have the trap handler complete things. That way it would all 'just work'. Except I can't seem to force GCC to emit that tail-call :-( I'll prod at it some more.
On Tue, 3 Jun 2025 15:04:55 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Jun 02, 2025 at 04:10:16PM -0700, Linus Torvalds wrote:
> > On Mon, 2 Jun 2025 at 14:57, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > So if I stuff the asm macro in a global asm() block then GCC ends up
> > > looking like so:
> >
> > Better, but as then the clang thing looks like a horrendous disaster.
> >
> > How about we simply make this all *code* instead of playing games with
> > register numbers?
> >
> > Why not just push the arguments by hand on the stack, and make that be
> > the interface? A 'push %reg' is like a byte or two. And you'd do it in
> > the cold section, so nobody cares.
> >
> > And the asm would look somewhat sane, instead of being crazy noise due
> > to crazy macros.
> >
> > Or so I imagine, because I didn't actually try it.
>
> Yeah, I can make that work.
>
> I've been trying to make __WARN_printk() (or similar) do a tail-call to
> a "UD2; RET;" stub. But doing printk() in a function makes GCC generate
> wild code that refuses to actually tail-call :/
>
> The next crazy idea was to make a variant of __WARN_printk() that takes
> a struct bug_entry * as first argument such that it has access to the
> bug entry and then take the trap on the way out (while keeping the
> pointer in the first argument) and then have the trap handler complete
> things.
>
> That way it would all 'just work'. Except I can't seem to force GCC to
> emit that tail-call :-(
>
> I'll prod at it some more.
How about a slightly less generic macro, something that could be:
#define WARN_IF(a, op, b, msg) \
if (unlikely((a) op (b)) { \
printf("WARN: %s (%x " #op " %x)\n", \
msg ? msg : "(" #a ") " #op " (" #b ")", (a), (b)); \
}
but could just be:
if (unlikely((a) op (b)) {
asm( " ud2; cmp %0, %1"
" .asciz msg; .asciz #op"
:: "r" (a), "r" (b));
}
So a ud2 followed by a reg-reg compare (should be REX/D16 prefix followed
by '3[89ab] /r') and two strings (literals or addresses).
With a suitable exception table entry.
That saves the problem of a generic printf format while still giving the
values of the variables associated with the failing test (for simple tests).
It should also avoid destroying register assignment for the rest of the
function.
If gcc refuses to do a jump for the 'if (unlikely...))' try adding an 'else'
clause containing an asm comment.
I've done that in the past to convert a backwards conditional jump (predicted taken)
into a forwards jump (predicted not taken) to an unconditional jump to the
actual target.
(I had a very tight clock limit for the 'worst case' path.)
David
On Mon, Jun 02, 2025 at 11:57:25PM +0200, Peter Zijlstra wrote: > On Mon, Jun 02, 2025 at 09:38:09AM -0700, Linus Torvalds wrote: > > > And no, I'm not ok with only using 'objdump' and friends to look at > > assembly generation. I want to be able to do > > > > make xyz.s > > > > and look at code generation without throwing up. > > So if I stuff the asm macro in a global asm() block then GCC ends up > looking like so: > > .set warn_imm, 0 > warn_add_reg var=warn_imm reg=%rcx # tmp215 > .set warn_imm, (warn_imm << 4) > warn_add_reg var=warn_imm reg=%rdx # tmp212 > 1: ud1 warn_imm(%ecx),%rax # tmp210 > .pushsection .discard.annotate_insn,"M",@progbits,8 > .long 1b - . > .long 8 > .popsection > .pushsection __bug_table, "aw" ; 123: .long 1b - . ; .long .LC76 - . ; .long .LC0 - . ; .word 8710 ; .word 2321 ; .org 123b + 6 + 4 + 6 ; .popsection #,,, > > However, clangd is 'helpful' and fully expands the asm macro for the .s > file :-( It generates his horror show: #APP .set warn_imm, 0 .set .Lregnr, 0 .set .Lregnr, 1 .set .Lregnr, 2 .set warn_imm, 2 .set .Lregnr, 3 .set .Lregnr, 4 .set .Lregnr, 5 .set .Lregnr, 6 .set .Lregnr, 7 .set .Lregnr, 8 .set .Lregnr, 9 .set .Lregnr, 10 .set .Lregnr, 11 .set .Lregnr, 12 .set .Lregnr, 13 .set .Lregnr, 14 .set .Lregnr, 15 .set .Lregnr, 16 .set .Lregnr, 0 .set .Lregnr, 1 .set .Lregnr, 2 .set .Lregnr, 3 .set .Lregnr, 4 .set .Lregnr, 5 .set .Lregnr, 6 .set .Lregnr, 7 .set .Lregnr, 8 .set .Lregnr, 9 .set .Lregnr, 10 .set .Lregnr, 11 .set .Lregnr, 12 .set .Lregnr, 13 .set .Lregnr, 14 .set .Lregnr, 15 .set .Lregnr, 16 .set warn_imm, 32 .set .Lregnr, 0 .set .Lregnr, 1 .set warn_imm, 33 .set .Lregnr, 2 .set .Lregnr, 3 .set .Lregnr, 4 .set .Lregnr, 5 .set .Lregnr, 6 .set .Lregnr, 7 .set .Lregnr, 8 .set .Lregnr, 9 .set .Lregnr, 10 .set .Lregnr, 11 .set .Lregnr, 12 .set .Lregnr, 13 .set .Lregnr, 14 .set .Lregnr, 15 .set .Lregnr, 16 .set .Lregnr, 0 .set .Lregnr, 1 .set .Lregnr, 2 .set .Lregnr, 3 .set .Lregnr, 4 .set .Lregnr, 5 .set .Lregnr, 6 .set .Lregnr, 7 .set .Lregnr, 8 .set .Lregnr, 9 .set .Lregnr, 10 .set .Lregnr, 11 .set .Lregnr, 12 .set .Lregnr, 13 .set .Lregnr, 14 .set .Lregnr, 15 .set .Lregnr, 16 .Ltmp1656: ud1q 33(%ecx), %rax .section .discard.annotate_insn,"M",@progbits,8 .Ltmp1657: .long .Ltmp1656-.Ltmp1657 .long 8 .section .init.text,"ax",@progbits .section __bug_table,"aw",@progbits .Ltmp1658: .Ltmp1659: .long .Ltmp1656-.Ltmp1659 .Ltmp1660: .long .L.str.29-.Ltmp1660 .Ltmp1661: .long .L.str-.Ltmp1661 .short 8710 .short 2321 .org ((.Ltmp1658+6)+4)+6, 0 .section .init.text,"ax",@progbits #NO_APP
On Mon, Jun 02, 2025 at 09:38:09AM -0700, Linus Torvalds wrote: > On Mon, 2 Jun 2025 at 08:50, Peter Zijlstra <peterz@infradead.org> wrote: > > > > Yes, it absolutely builds with clang. The inline asm isn't something we > > don't already do elsewhere :-) *cough* extable *cough* > > Eww. I hadn't looked at that (or repressed it if I did). *Shudder*. > > But looking around, I don't think any of the normal code I ever look > at actually *generate* that disgusting thing. > > I had to search for it, and looked at the absolute horror it generates > in the futex code, and honestly, if I ever have to look at that > garbage, I would throw up. > > And WARN_ONCE() is in stuff I *do* look at. > > So no. I'm NAK'ing it just because it makes the asm look entirely unreadable. > > And no, I'm not ok with only using 'objdump' and friends to look at > assembly generation. I want to be able to do > > make xyz.s > > and look at code generation without throwing up. > > The fact that we have this disgusting thing elsewhere in places that > I've not looked at does *not* excuse adding it to other places. Right. So the problem is using asm macros in inline-asm. We've tried adding those macros to a global asm, but IIRC that had trouble. So yeah, now we do the macro definition and purge right around the inline asm and then you get this horror show :-( Anyway, I'll try and come up with something else.
On Mon, Jun 02, 2025 at 08:09:22PM +0200, Peter Zijlstra wrote: > Right. So the problem is using asm macros in inline-asm. We've tried > adding those macros to a global asm, but IIRC that had trouble. Yeah, IIRC, the problem I ran into was that clang doesn't allow defining asm macros in global asm(). -- Josh
> Yeah, IIRC, the problem I ran into was that clang doesn't allow defining > asm macros in global asm(). Macros in global asm was fixed in Clang 6. https://bugs.llvm.org/show_bug.cgi?id=36110 But https://github.com/llvm/llvm-project/issues/60792 is still unfixed and waiting to trip people up. ~Andrew
On Mon, Jun 02, 2025 at 08:09:22PM +0200, Peter Zijlstra wrote: > On Mon, Jun 02, 2025 at 09:38:09AM -0700, Linus Torvalds wrote: > > On Mon, 2 Jun 2025 at 08:50, Peter Zijlstra <peterz@infradead.org> wrote: > > > > > > Yes, it absolutely builds with clang. The inline asm isn't something we > > > don't already do elsewhere :-) *cough* extable *cough* > > > > Eww. I hadn't looked at that (or repressed it if I did). *Shudder*. > > > > But looking around, I don't think any of the normal code I ever look > > at actually *generate* that disgusting thing. > > > > I had to search for it, and looked at the absolute horror it generates > > in the futex code, and honestly, if I ever have to look at that > > garbage, I would throw up. > > > > And WARN_ONCE() is in stuff I *do* look at. > > > > So no. I'm NAK'ing it just because it makes the asm look entirely unreadable. > > > > And no, I'm not ok with only using 'objdump' and friends to look at > > assembly generation. I want to be able to do > > > > make xyz.s > > > > and look at code generation without throwing up. > > > > The fact that we have this disgusting thing elsewhere in places that > > I've not looked at does *not* excuse adding it to other places. > > Right. So the problem is using asm macros in inline-asm. We've tried > adding those macros to a global asm, but IIRC that had trouble. > > So yeah, now we do the macro definition and purge right around the > inline asm and then you get this horror show :-( > > Anyway, I'll try and come up with something else. At least the purge could be skipped if you check a global assembler variable before defining the macro. For example here was one of my experiments trying to making alternatives more readable: #define DEFINE_ALT_MACRO \ ".ifndef alt; " \ ".set alt, 1; " \ __stringify(ALTERNATIVE_MACRO) "; " \ ".endif\n\n\t" \ #define __ALTERNATIVE(oldinstr, newinstr, ft_flags, ft_flags_str) \ "\n# <ALTERNATIVE>\n" \ DEFINE_ALT_MACRO \ "alternative \"" oldinstr "\", \"" \ newinstr "\", \"" \ __stringify(ft_flags) "\" /* " ft_flags_str " */" \ "\n# </ALTERNATIVE>\n" which produced code like: # <ALTERNATIVE> .ifndef alt; .set alt, 1; .macro __alt_replace orig, orig_end, newinstr, ft_flags, args:vararg; .skip -(((.Lnew_end_\@ - .Lnew_\@) - (\orig_end - \orig)) > 0) * ((.Lnew_end_\@ - .Lnew_\@) - (\orig_end - \orig)), 0x90; .Lskip_end_\@: .pushsection .altinstr_replacement, "ax"; .Lnew_\@: \newinstr; .Lnew_end_\@: .popsection; .pushsection .altinstructions, "a"; .long \orig - .; .long .Lnew_\@ - .; .4byte \ft_flags; .byte .Lskip_end_\@ - \orig; .byte .Lnew_end_\@ - .Lnew_\@; .popsection; .ifnb \args; __alt_replace \orig, \orig_end, \args; .endif; .endm; .macro alternative oldinstr, args:vararg; .Lorig_\@: \oldinstr; .Lorig_end_\@: __alt_replace .Lorig_\@, .Lorig_end_\@, \args; .endm; .endif alternative "", "stac", "( 9*32+20)" /* X86_FEATURE_SMAP */ # </ALTERNATIVE> which is much more readable if you ignore the horrendous first line ;-) -- Josh
© 2016 - 2025 Red Hat, Inc.