[PATCH RFC 4/5] x86/alternative: Improve code generation readability

Josh Poimboeuf posted 5 patches 8 months, 2 weeks ago
[PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Josh Poimboeuf 8 months, 2 weeks ago
Improve the readability and compactness of alternatives code.

---------------------
ALTERNATIVE() before:
---------------------

  	# ALT: oldinstr
  771:
  	rep movsb
  772:
  # ALT: padding
  .skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
  773:
  .pushsection .altinstructions,"a"
   .long 771b - .
   .long 774f - .
   .4byte (((1 << 0) << 16) | ((18*32+ 4)))
   .byte 773b-771b
   .byte 775f-774f
  .popsection
  .pushsection .altinstr_replacement, "ax"
  # ALT: replacement
  774:
  	call rep_movs_alternative
  775:
  .popsection

--------------------
ALTERNATIVE() after:
--------------------

 # <ALTERNATIVE>
 771:	rep movsb
 772:	.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
 773:
 # ALT ENTRY:
 .pushsection .altinstructions,"a";  .long 771b - .;  .long 774f - .;  .4byte (((1 << 0) << 16) | ((18*32+ 4)));  .byte 773b-771b;  .byte 775f-774f; .popsection
 # ALT REPLACEMENT:
 .pushsection .altinstr_replacement, "ax"
 774:	call rep_movs_alternative
 775:
 .popsection
 # </ALTERNATIVE>

-----------------------
ALTERNATIVE_2() before:
-----------------------

 	# ALT: oldinstr
 771:
 	# ALT: oldinstr
 771:
 	jmp 6f
 772:
 # ALT: padding
 .skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
 773:
 .pushsection .altinstructions,"a"
  .long 771b - .
  .long 774f - .
  .4byte ( 3*32+21)
  .byte 773b-771b
  .byte 775f-774f
 .popsection
 .pushsection .altinstr_replacement, "ax"
 # ALT: replacement
 774:
 	jmp .L4	#
 775:
 .popsection

 772:
 # ALT: padding
 .skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
 773:
 .pushsection .altinstructions,"a"
  .long 771b - .
  .long 774f - .
  .4byte 297	#
  .byte 773b-771b
  .byte 775f-774f
 .popsection
 .pushsection .altinstr_replacement, "ax"
 # ALT: replacement
 774:

 775:
 .popsection

----------------------
ALTERNATIVE_2() after:
----------------------

 # <ALTERNATIVE_2>
 771:
 771:	jmp 6f
 772:	.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
 773:
 # ALT ENTRY:
 .pushsection .altinstructions,"a";  .long 771b - .;  .long 774f - .;  .4byte ( 3*32+21);  .byte 773b-771b;  .byte 775f-774f; .popsection
 # ALT REPLACEMENT:
 .pushsection .altinstr_replacement, "ax"
 774:	jmp .L4	#
 775:
 .popsection
 772:	.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
 773:
 # ALT ENTRY:
 .pushsection .altinstructions,"a";  .long 771b - .;  .long 774f - .;  .4byte 297;  .byte 773b-771b;  .byte 775f-774f; .popsection	#
 # ALT REPLACEMENT:
 .pushsection .altinstr_replacement, "ax"
 774:
 775:
 .popsection
 # </ALTERNATIVE_2>

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/x86/include/asm/alternative.h | 88 +++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 4a37a8bd87fd..6472d53625dc 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -151,46 +151,84 @@ static inline int alternatives_text_reserved(void *start, void *end)
 #define alt_rlen		"775f-774f"
 
 #define OLDINSTR(oldinstr)						\
-	"# ALT: oldinstr\n"						\
-	"771:\n\t" oldinstr "\n772:\n"					\
-	"# ALT: padding\n"						\
-	".skip -(((" alt_rlen ")-(" alt_slen ")) > 0) * "		\
-		"((" alt_rlen ")-(" alt_slen ")),0x90\n"		\
+	"771:" oldinstr "\n"						\
+	"772:\t.skip -(((" alt_rlen ")-(" alt_slen ")) > 0) * "		\
+		      "((" alt_rlen ")-(" alt_slen ")),0x90\n"		\
 	"773:\n"
 
-#define ALTINSTR_ENTRY(ft_flags)					      \
-	".pushsection .altinstructions,\"a\"\n"				      \
-	" .long 771b - .\n"				/* label           */ \
-	" .long 774f - .\n"				/* new instruction */ \
-	" .4byte " __stringify(ft_flags) "\n"		/* feature + flags */ \
-	" .byte " alt_total_slen "\n"			/* source len      */ \
-	" .byte " alt_rlen "\n"				/* replacement len */ \
+#define ALTINSTR_ENTRY(ft_flags)					\
+	"# ALT ENTRY:\n"						\
+	".pushsection .altinstructions,\"a\"; "				\
+	" .long 771b - .; "			/* label           */	\
+	" .long 774f - .; "			/* new instruction */	\
+	" .4byte " __stringify(ft_flags) "; "	/* feature + flags */	\
+	" .byte " alt_total_slen "; "		/* source len      */	\
+	" .byte " alt_rlen "; "			/* replacement len */	\
 	".popsection\n"
 
-#define ALTINSTR_REPLACEMENT(newinstr)		/* replacement */	\
+#define ALTINSTR_REPLACEMENT(newinstr)					\
+	"# ALT REPLACEMENT:\n"						\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
-	"# ALT: replacement\n"						\
-	"774:\n\t" newinstr "\n775:\n"					\
-	".popsection\n"
+	"774:\t" newinstr "\n"						\
+	"775:\n"							\
+	".popsection"
 
-/* alternative assembly primitive: */
-#define ALTERNATIVE(oldinstr, newinstr, ft_flags)			\
+
+#define __ALTERNATIVE(oldinstr, newinstr, ft_flags)			\
 	OLDINSTR(oldinstr)						\
 	ALTINSTR_ENTRY(ft_flags)					\
 	ALTINSTR_REPLACEMENT(newinstr)
 
-#define ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
-	ALTERNATIVE(ALTERNATIVE(oldinstr, newinstr1, ft_flags1), newinstr2, ft_flags2)
+#define __ALTERNATIVE_2(oldinstr,					\
+			newinstr1, ft_flags1,				\
+			newinstr2, ft_flags2)				\
+	__ALTERNATIVE("\n"						\
+		      __ALTERNATIVE(oldinstr,				\
+				    newinstr1, ft_flags1),		\
+		      newinstr2, ft_flags2)
+
+#define __ALTERNATIVE_3(oldinstr,					\
+			newinstr1, ft_flags1,				\
+			newinstr2, ft_flags2,				\
+			newinstr3, ft_flags3)				\
+	__ALTERNATIVE("\n"						\
+		      __ALTERNATIVE_2(oldinstr,				\
+				      newinstr1, ft_flags1,		\
+				      newinstr2, ft_flags2),		\
+		      newinstr3, ft_flags3)
+
+
+#define ALTERNATIVE(oldinstr, newinstr, ft_flags)			\
+	"\n# <ALTERNATIVE>\n"						\
+	__ALTERNATIVE("\t" oldinstr, newinstr, ft_flags)		\
+	"\n# </ALTERNATIVE>\n"
+
+#define ALTERNATIVE_2(oldinstr,						\
+		      newinstr1, ft_flags1,				\
+		      newinstr2, ft_flags2)				\
+	"\n# <ALTERNATIVE_2>\n"						\
+	__ALTERNATIVE_2("\t"						\
+			oldinstr,					\
+			newinstr1, ft_flags1,				\
+			newinstr2, ft_flags2)				\
+	"\n# </ALTERNATIVE_2>\n"
+
+#define ALTERNATIVE_3(oldinstr,						\
+		      newinstr1, ft_flags1,				\
+		      newinstr2, ft_flags2,				\
+		      newinstr3, ft_flags3)				\
+	"\n# <ALTERNATIVE_3>\n"						\
+	__ALTERNATIVE_3("\t"						\
+			oldinstr,					\
+			newinstr1, ft_flags1,				\
+			newinstr2, ft_flags2,				\
+			newinstr3, ft_flags3)				\
+	"\n# </ALTERNATIVE_3>\n"
 
 /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
 #define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
 	ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, newinstr_yes, ft_flags)
 
-#define ALTERNATIVE_3(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, \
-			newinstr3, ft_flags3)				\
-	ALTERNATIVE(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2), \
-		      newinstr3, ft_flags3)
-
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
-- 
2.49.0
Re: [PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Peter Zijlstra 8 months, 1 week ago
On Tue, Apr 08, 2025 at 01:21:17AM -0700, Josh Poimboeuf wrote:
> Improve the readability and compactness of alternatives code.
> 
> ---------------------
> ALTERNATIVE() before:
> ---------------------
> 
>   	# ALT: oldinstr
>   771:
>   	rep movsb
>   772:
>   # ALT: padding
>   .skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
>   773:
>   .pushsection .altinstructions,"a"
>    .long 771b - .
>    .long 774f - .
>    .4byte (((1 << 0) << 16) | ((18*32+ 4)))
>    .byte 773b-771b
>    .byte 775f-774f
>   .popsection
>   .pushsection .altinstr_replacement, "ax"
>   # ALT: replacement
>   774:
>   	call rep_movs_alternative
>   775:
>   .popsection
> 
> --------------------
> ALTERNATIVE() after:
> --------------------
> 
>  # <ALTERNATIVE>
>  771:	rep movsb
>  772:	.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
>  773:
>  # ALT ENTRY:
>  .pushsection .altinstructions,"a";  .long 771b - .;  .long 774f - .;  .4byte (((1 << 0) << 16) | ((18*32+ 4)));  .byte 773b-771b;  .byte 775f-774f; .popsection

I find this very hard to read. I prefer the multi line form it had
before.

Other than that, I like the new layout.

>  # ALT REPLACEMENT:
>  .pushsection .altinstr_replacement, "ax"
>  774:	call rep_movs_alternative
>  775:
>  .popsection
>  # </ALTERNATIVE>
Re: [PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Josh Poimboeuf 8 months, 1 week ago
On Wed, Apr 09, 2025 at 04:38:21PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 08, 2025 at 01:21:17AM -0700, Josh Poimboeuf wrote:
> > Improve the readability and compactness of alternatives code.
> > 
> > ---------------------
> > ALTERNATIVE() before:
> > ---------------------
> > 
> >   	# ALT: oldinstr
> >   771:
> >   	rep movsb
> >   772:
> >   # ALT: padding
> >   .skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
> >   773:
> >   .pushsection .altinstructions,"a"
> >    .long 771b - .
> >    .long 774f - .
> >    .4byte (((1 << 0) << 16) | ((18*32+ 4)))
> >    .byte 773b-771b
> >    .byte 775f-774f
> >   .popsection
> >   .pushsection .altinstr_replacement, "ax"
> >   # ALT: replacement
> >   774:
> >   	call rep_movs_alternative
> >   775:
> >   .popsection
> > 
> > --------------------
> > ALTERNATIVE() after:
> > --------------------
> > 
> >  # <ALTERNATIVE>
> >  771:	rep movsb
> >  772:	.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
> >  773:
> >  # ALT ENTRY:
> >  .pushsection .altinstructions,"a";  .long 771b - .;  .long 774f - .;  .4byte (((1 << 0) << 16) | ((18*32+ 4)));  .byte 773b-771b;  .byte 775f-774f; .popsection
> 
> I find this very hard to read. I prefer the multi line form it had
> before.

I actually think the compactness of putting the entry on a single line
is really nice.

We're usually more interested in reading the code *around* the
alternatives, rather than the alternatives themselves.

Making the alt entry a big sprawling mess, for what typically resolves
to just a few instructions, makes that a LOT harder.  And it's *really*
bad for ALTERNATIVE_2() and ALTERNATIVE_3().

Also, 99% of the time, we're not going to be debugging the alternatives
themselves.  Spreading out the alt entry directives across multiple
lines is way overkill.  It wastes space and cognitive load.

That's what the comments are for.  All you really need to care about are
the original instructions, the replacement instructions, and potentially
what feature+flags.

We could even print the feature by adding the '(((1 << 0) << 16) |
((18*32+ 4)))' to one of the comments.

Or even better, we can just show the actual human readable feature:
ALT_NOT(X86_FEATURE_FSRM).

Something like:

 # <ALTERNATIVE>
 771:	rep movsb
 772:	.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
 773:
 # ALT ENTRY:
 .pushsection .altinstructions,"a";  .long 771b - .;  .long 774f - .;  .4byte (((1 << 0) << 16) | ((18*32+ 4)));  .byte 773b-771b;  .byte 775f-774f; .popsection
 # ALT REPLACEMENT: ALT_NOT(X86_FEATURE_FSRM):
 .pushsection .altinstr_replacement, "ax"
 774:	call rep_movs_alternative
 775:
 .popsection
 # </ALTERNATIVE>

Then with all the relevant information in the comments, there's no
reason to make the alt entry directives themselves very readable, right?
Unless alternatives are broken and you're debugging it.  Which should
very much be the exception rather than the rule.

One thing that does bother me slightly is the asymmetry of having the
.pushsection and .popsection on the same line, whereas the replacement
has them on separate lines.  We could just put the .popsection on its
own line:

 # <ALTERNATIVE>
 771:	rep movsb
 772:	.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
 773:
 # ALT ENTRY:
 .pushsection .altinstructions,"a";  .long 771b - .;  .long 774f - .;  .4byte (((1 << 0) << 16) | ((18*32+ 4)));  .byte 773b-771b;  .byte 775f-774f
 .popsection
 # ALT REPLACEMENT: ALT_NOT(X86_FEATURE_FSRM):
 .pushsection .altinstr_replacement, "ax"
 774:	call rep_movs_alternative
 775:
 .popsection
 # </ALTERNATIVE>

-- 
Josh
Re: [PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Linus Torvalds 8 months, 1 week ago
On Wed, 9 Apr 2025 at 10:41, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> I actually think the compactness of putting the entry on a single line
> is really nice.

Yeah, I think the noise with size calculations in particular is stuff
that I never look at, and making it one long line is better than
multiple lines, I think.

That said, for some of the alternatives, it would be even nicer if we
could make the noise more compact by just hardcoding sizes. Our
generic alternatives macros do result in some *very* verbose output
that is entirely illegible to humans.

And yes, we need that generic case for most of them, since the
instruction size will depend on register choice and the modrm
addressing details etc.

But some of them are kind of pointless.

I did have something that just knew that 'clac'/'stac' were three-byte
instructions (and it was very obvious indeed back when we encoded them
as such, using the ".byte" syntax).

And that avoided a *lot* of noise in the alternatives section, and
removed the subsequent ".skip" part entirely because both sides would
just use the right size without any calculations.

I can't find that hacky patch of mine, and I didn't keep it around
because it *was* hacky, but maybe I'll try to resurrect a cleaner
version of it.

There may not be very many people who care, because yeah, it really
only shows up when looking at the compiler-generated assembler output,
and I agree that that isn't exactly _common_.

              Linus
Re: [PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Josh Poimboeuf 8 months ago
On Wed, Apr 09, 2025 at 11:02:29AM -0700, Linus Torvalds wrote:
> On Wed, 9 Apr 2025 at 10:41, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >
> > I actually think the compactness of putting the entry on a single line
> > is really nice.
> 
> Yeah, I think the noise with size calculations in particular is stuff
> that I never look at, and making it one long line is better than
> multiple lines, I think.

We could just give up on trying to making the code itself readable, and
instead put the code+features in comments, e.g.:

# <ALTERNATIVE>
#
# ORIG:	rep movsb
# NEW:	call rep_movs_alternative	# ALT_NOT(X86_FEATURE_FSRM)
#
771: rep movsb; 772: .skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90; 773: .pushsection .altinstructions,"a"; .long 771b - .; .long 774f - .; .4byte (((1 << 0) << 16) | ((18*32+ 4))); .byte 773b-771b; .byte 775f-774f; .popsection; .pushsection .altinstr_replacement, "ax"; 774: call rep_movs_alternative; 775: .popsection
# </ALTERNATIVE>

-- 
Josh
Re: [PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Josh Poimboeuf 8 months, 1 week ago
On Wed, Apr 09, 2025 at 11:02:29AM -0700, Linus Torvalds wrote:
> On Wed, 9 Apr 2025 at 10:41, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >
> > I actually think the compactness of putting the entry on a single line
> > is really nice.
> 
> Yeah, I think the noise with size calculations in particular is stuff
> that I never look at, and making it one long line is better than
> multiple lines, I think.
> 
> That said, for some of the alternatives, it would be even nicer if we
> could make the noise more compact by just hardcoding sizes. Our
> generic alternatives macros do result in some *very* verbose output
> that is entirely illegible to humans.

What if we were to use a global asm() to define an alternative .macro
whenever "alternative.h" gets included?

e.g.:

asm(".macro alternative oldinstr, newinstr, ft_flags\n"
    "771:\t\\oldinstr\n"
    "772:\t.skip -(((" alt_rlen ")-(" alt_slen ")) > 0) * "
		  "((" alt_rlen ")-(" alt_slen ")),0x90\n"
    "773:\n"
    ".pushsection .altinstructions,\"a\"\n\t"
    ".long 771b - .\n\t"			/* label           */
    ".long 774f - .\n\t "			/* new instruction */
    ".4byte \\ft_flags\n\t"			/* feature + flags */
    ".byte " alt_total_slen "\n\t"		/* source len      */
    ".byte " alt_rlen "\n\t"			/* replacement len */
    ".popsection\n"
    ".pushsection .altinstr_replacement, \"ax\"\n"
    "774:\t\\newinstr\n"
    "775:\n"
    ".popsection\n"
    ".endm");

#define ALTERNATIVE(oldinstr, newinstr, ft_flags)			\
	"alternative \"" oldinstr "\", \"" newinstr "\", \"" __stringify(ft_flags) "\"; "

Then it becomes extremely readable:

  alternative "", "stac", "( 9*32+20)";

Of course the downside is the macro gets generated even if it's never
used.

-- 
Josh
Re: [PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Linus Torvalds 8 months, 1 week ago
On Wed, 9 Apr 2025 at 12:51, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> What if we were to use a global asm() to define an alternative .macro
> whenever "alternative.h" gets included?

Yeah, I wouldn't mind that, but I have this dim memory of us having
tried it at some point and it didn't work.

I think the issue was that the in-compiler assembler was not as
complete as the external one (ie not doing macros at all or something
like that).

             Linus
Re: [PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Josh Poimboeuf 8 months, 1 week ago
On Wed, Apr 09, 2025 at 02:20:55PM -0700, Linus Torvalds wrote:
> On Wed, 9 Apr 2025 at 12:51, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >
> > What if we were to use a global asm() to define an alternative .macro
> > whenever "alternative.h" gets included?
> 
> Yeah, I wouldn't mind that, but I have this dim memory of us having
> tried it at some point and it didn't work.
> 
> I think the issue was that the in-compiler assembler was not as
> complete as the external one (ie not doing macros at all or something
> like that).

It seems to work with GCC 14 and Clang 18 at least.  I can try to find
some old toolchains to test with.

-- 
Josh
Re: [PATCH RFC 4/5] x86/alternative: Improve code generation readability
Posted by Josh Poimboeuf 8 months, 1 week ago
On Wed, Apr 09, 2025 at 02:27:28PM -0700, Josh Poimboeuf wrote:
> On Wed, Apr 09, 2025 at 02:20:55PM -0700, Linus Torvalds wrote:
> > On Wed, 9 Apr 2025 at 12:51, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > >
> > > What if we were to use a global asm() to define an alternative .macro
> > > whenever "alternative.h" gets included?
> > 
> > Yeah, I wouldn't mind that, but I have this dim memory of us having
> > tried it at some point and it didn't work.
> > 
> > I think the issue was that the in-compiler assembler was not as
> > complete as the external one (ie not doing macros at all or something
> > like that).
> 
> It seems to work with GCC 14 and Clang 18 at least.  I can try to find
> some old toolchains to test with.

Actually, Clang *compiled*, but on closer inspection it's actually
silently omitting the inline asm :-/

So yeah, that's not going to work.

-- 
Josh