[PATCH 4/4] x86/boot: Use fastcall for 32bit code

Andrew Cooper posted 4 patches 2 months, 2 weeks ago
[PATCH 4/4] x86/boot: Use fastcall for 32bit code
Posted by Andrew Cooper 2 months, 2 weeks ago
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Frediano Ziglio <frediano.ziglio@cloud.com>

RFC.  This doesn't boot, but I haven't quite figured out where yet.
---
 xen/arch/x86/boot/Makefile  |  2 +-
 xen/arch/x86/boot/cmdline.c |  7 +++----
 xen/arch/x86/boot/head.S    | 15 +++++++++------
 xen/arch/x86/boot/reloc.c   | 12 ++++++------
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/xen/arch/x86/boot/Makefile b/xen/arch/x86/boot/Makefile
index 03d8ce3a9e48..b327663fba94 100644
--- a/xen/arch/x86/boot/Makefile
+++ b/xen/arch/x86/boot/Makefile
@@ -13,7 +13,7 @@ $(obj)/head.o: $(head-bin-objs:.o=.bin)
 
 CFLAGS_x86_32 := $(subst -m64,-m32 -march=i686,$(XEN_TREEWIDE_CFLAGS))
 $(call cc-options-add,CFLAGS_x86_32,CC,$(EMBEDDED_EXTRA_CFLAGS))
-CFLAGS_x86_32 += -Werror -fno-builtin -g0 -msoft-float
+CFLAGS_x86_32 += -Werror -fno-builtin -g0 -msoft-float -mregparm=3
 ifneq ($(abs_objtree),$(abs_srctree))
 CFLAGS_x86_32 += -I$(objtree)/include
 endif
diff --git a/xen/arch/x86/boot/cmdline.c b/xen/arch/x86/boot/cmdline.c
index bdd5b95c224b..6eddbb37b5b9 100644
--- a/xen/arch/x86/boot/cmdline.c
+++ b/xen/arch/x86/boot/cmdline.c
@@ -20,8 +20,8 @@
 
 /*
  * This entry point is entered from xen/arch/x86/boot/head.S with:
- *   - 0x4(%esp) = &cmdline,
- *   - 0x8(%esp) = &early_boot_opts.
+ *   - %eax      = &cmdline,
+ *   - %ecx      = &early_boot_opts.
  */
 asm (
     "    .text                         \n"
@@ -347,8 +347,7 @@ static void vga_parse(const char *cmdline, early_boot_opts_t *ebo)
 #endif
 
 /* SAF-1-safe */
-void __attribute__((__stdcall__))
-cmdline_parse_early(const char *cmdline, early_boot_opts_t *ebo)
+void cmdline_parse_early(const char *cmdline, early_boot_opts_t *ebo)
 {
     if ( !cmdline )
         return;
diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index d867b015d961..b745a7cd3a26 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -613,10 +613,13 @@ trampoline_setup:
 
         /* Save Multiboot / PVH info struct (after relocation) for later use. */
         push    %edx                /* Boot video info to be filled from MB2. */
-        push    %ecx                /* Bottom-most low-memory stack address. */
-        push    %ebx                /* Multiboot / PVH information address. */
-        push    %eax                /* Magic number. */
+        mov     %ecx, %edx          /* Bottom-most low-memory stack address. */
+        mov     %ebx, %ecx          /* Multiboot / PVH information address. */
+                                    /* Magic number. */
+        /*      reloc(magic, info, trampoline, video) using fastcall(a, c, d, stk). */
         call    reloc
+        add     $4, %esp
+
 #ifdef CONFIG_PVH_GUEST
         cmpb    $0, sym_esi(pvh_boot)
         je      1f
@@ -848,9 +851,9 @@ trampoline_setup:
         testl   $MBI_CMDLINE,MB_flags(%ebx)
         jz      1f
 
-        lea     sym_esi(early_boot_opts),%eax
-        push    %eax
-        pushl   MB_cmdline(%ebx)
+        lea     sym_esi(early_boot_opts), %ecx
+        lea     MB_cmdline(%ebx), %eax
+        /*      cmdline_parse_early(cmdline, opts) using fastcall(a, c). */
         call    cmdline_parse_early
 
 1:
diff --git a/xen/arch/x86/boot/reloc.c b/xen/arch/x86/boot/reloc.c
index 1e3a30fab0c6..32acded81d78 100644
--- a/xen/arch/x86/boot/reloc.c
+++ b/xen/arch/x86/boot/reloc.c
@@ -14,10 +14,10 @@
 
 /*
  * This entry point is entered from xen/arch/x86/boot/head.S with:
- *   - 0x04(%esp) = MAGIC,
- *   - 0x08(%esp) = INFORMATION_ADDRESS,
- *   - 0x0c(%esp) = TOPMOST_LOW_MEMORY_STACK_ADDRESS.
- *   - 0x10(%esp) = BOOT_VIDEO_INFO_ADDRESS.
+ *   - %eax       = MAGIC,
+ *   - %ecx       = INFORMATION_ADDRESS,
+ *   - %edx       = TOPMOST_LOW_MEMORY_STACK_ADDRESS.
+ *   - 0x04(%esp) = BOOT_VIDEO_INFO_ADDRESS.
  */
 asm (
     "    .text                         \n"
@@ -353,8 +353,8 @@ static multiboot_info_t *mbi2_reloc(uint32_t mbi_in, uint32_t video_out)
 }
 
 /* SAF-1-safe */
-void *__attribute__((__stdcall__))
-reloc(uint32_t magic, uint32_t in, uint32_t trampoline, uint32_t video_info)
+void *reloc(uint32_t magic, uint32_t in, uint32_t trampoline,
+            uint32_t video_info)
 {
     alloc = trampoline;
 
-- 
2.39.2


Re: [PATCH 4/4] x86/boot: Use fastcall for 32bit code
Posted by Jan Beulich 2 months, 2 weeks ago
On 02.09.2024 15:32, Andrew Cooper wrote:
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Frediano Ziglio <frediano.ziglio@cloud.com>
> 
> RFC.  This doesn't boot, but I haven't quite figured out where yet.

Because you got the register use wrong maybe? I think it's %eax, %edx,
and then %ecx.

Jan


Re: [PATCH 4/4] x86/boot: Use fastcall for 32bit code
Posted by Andrew Cooper 2 months, 2 weeks ago
On 02/09/2024 4:39 pm, Jan Beulich wrote:
> On 02.09.2024 15:32, Andrew Cooper wrote:
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>> CC: Frediano Ziglio <frediano.ziglio@cloud.com>
>>
>> RFC.  This doesn't boot, but I haven't quite figured out where yet.
> Because you got the register use wrong maybe? I think it's %eax, %edx,
> and then %ecx.

Bah, yes it is.  Also,

@@ -848,9 +851,9 @@ trampoline_setup:
         testl   $MBI_CMDLINE,MB_flags(%ebx)
         jz      1f
 
-        lea     sym_esi(early_boot_opts),%eax
-        push    %eax
-        pushl   MB_cmdline(%ebx)
+        lea     sym_esi(early_boot_opts), %ecx
+        lea     MB_cmdline(%ebx), %eax
+        /*      cmdline_parse_early(cmdline, opts) using fastcall(a, c). */
         call    cmdline_parse_early
 
 1:

is a wrong transformation, I think.  The second lea should be a mov: the
original pushl passed the value stored at MB_cmdline(%ebx), not its address.

I'll have another play when I've got some time, but for now I'll commit
the prior 3 patches to get rid of defs.h.

~Andrew

[PATCH v2] x86/boot: Use fastcall for 32bit code
Posted by Andrew Cooper 2 months, 2 weeks ago
This is marginally more efficient, but is mostly to get rid of the use of
stdcall in cmdline.c and reloc.c.

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Frediano Ziglio <frediano.ziglio@cloud.com>

v2:
 * Fixed up to work properly.

I'm tempted to rebase this ahead of "[PATCH 3/4] x86/boot: Use
<xen/compiler.h>" and remove the transitory open-coding of __stdcall.

With some manual ELF annotations around the incbin's, this is the marginal
delta:

  $ ../scripts/bloat-o-meter xen-syms-before xen-syms-after
  add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-188 (-188)
  Function                                     old     new   delta
  reloc                                       2352    2328     -24
  cmdline_parse_early                         2699    2535    -164
---
 xen/arch/x86/boot/Makefile  |  2 +-
 xen/arch/x86/boot/cmdline.c |  7 +++----
 xen/arch/x86/boot/head.S    | 15 ++++++++-------
 xen/arch/x86/boot/reloc.c   | 12 ++++++------
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/xen/arch/x86/boot/Makefile b/xen/arch/x86/boot/Makefile
index 03d8ce3a9e48..b327663fba94 100644
--- a/xen/arch/x86/boot/Makefile
+++ b/xen/arch/x86/boot/Makefile
@@ -13,7 +13,7 @@ $(obj)/head.o: $(head-bin-objs:.o=.bin)
 
 CFLAGS_x86_32 := $(subst -m64,-m32 -march=i686,$(XEN_TREEWIDE_CFLAGS))
 $(call cc-options-add,CFLAGS_x86_32,CC,$(EMBEDDED_EXTRA_CFLAGS))
-CFLAGS_x86_32 += -Werror -fno-builtin -g0 -msoft-float
+CFLAGS_x86_32 += -Werror -fno-builtin -g0 -msoft-float -mregparm=3
 ifneq ($(abs_objtree),$(abs_srctree))
 CFLAGS_x86_32 += -I$(objtree)/include
 endif
diff --git a/xen/arch/x86/boot/cmdline.c b/xen/arch/x86/boot/cmdline.c
index bdd5b95c224b..fc9241ede9a0 100644
--- a/xen/arch/x86/boot/cmdline.c
+++ b/xen/arch/x86/boot/cmdline.c
@@ -20,8 +20,8 @@
 
 /*
  * This entry point is entered from xen/arch/x86/boot/head.S with:
- *   - 0x4(%esp) = &cmdline,
- *   - 0x8(%esp) = &early_boot_opts.
+ *   - %eax      = &cmdline,
+ *   - %edx      = &early_boot_opts.
  */
 asm (
     "    .text                         \n"
@@ -347,8 +347,7 @@ static void vga_parse(const char *cmdline, early_boot_opts_t *ebo)
 #endif
 
 /* SAF-1-safe */
-void __attribute__((__stdcall__))
-cmdline_parse_early(const char *cmdline, early_boot_opts_t *ebo)
+void cmdline_parse_early(const char *cmdline, early_boot_opts_t *ebo)
 {
     if ( !cmdline )
         return;
diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index 3c0346016ce8..f7497bce393c 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -613,10 +613,11 @@ trampoline_setup:
 
         /* Save Multiboot / PVH info struct (after relocation) for later use. */
         push    %edx                /* Boot video info to be filled from MB2. */
-        push    %ecx                /* Bottom-most low-memory stack address. */
-        push    %ebx                /* Multiboot / PVH information address. */
-        push    %eax                /* Magic number. */
+        mov     %ebx, %edx          /* Multiboot / PVH information address. */
+        /*      reloc(magic/eax, info/edx, trampoline/ecx, video/stk) using fastcall. */
         call    reloc
+        add     $4, %esp
+
 #ifdef CONFIG_PVH_GUEST
         cmpb    $0, sym_esi(pvh_boot)
         je      1f
@@ -848,9 +849,9 @@ trampoline_setup:
         testl   $MBI_CMDLINE,MB_flags(%ebx)
         jz      1f
 
-        lea     sym_esi(early_boot_opts),%eax
-        push    %eax
-        pushl   MB_cmdline(%ebx)
+        lea     sym_esi(early_boot_opts), %edx
+        mov     MB_cmdline(%ebx), %eax
+        /*      cmdline_parse_early(cmdline/eax, opts/edx) using fastcall. */
         call    cmdline_parse_early
 
 1:
@@ -871,7 +872,7 @@ trampoline_setup:
 
         /*
          * cmdline and reloc are written in C, and linked to be 32bit PIC with
-         * entrypoints at 0 and using the stdcall convention.
+         * entrypoints at 0 and using the fastcall convention.
          */
 FUNC(cmdline_parse_early)
         .incbin "cmdline.bin"
diff --git a/xen/arch/x86/boot/reloc.c b/xen/arch/x86/boot/reloc.c
index 1e3a30fab0c6..201e38d5445d 100644
--- a/xen/arch/x86/boot/reloc.c
+++ b/xen/arch/x86/boot/reloc.c
@@ -14,10 +14,10 @@
 
 /*
  * This entry point is entered from xen/arch/x86/boot/head.S with:
- *   - 0x04(%esp) = MAGIC,
- *   - 0x08(%esp) = INFORMATION_ADDRESS,
- *   - 0x0c(%esp) = TOPMOST_LOW_MEMORY_STACK_ADDRESS.
- *   - 0x10(%esp) = BOOT_VIDEO_INFO_ADDRESS.
+ *   - %eax       = MAGIC,
+ *   - %edx       = INFORMATION_ADDRESS,
+ *   - %ecx       = TOPMOST_LOW_MEMORY_STACK_ADDRESS.
+ *   - 0x04(%esp) = BOOT_VIDEO_INFO_ADDRESS.
  */
 asm (
     "    .text                         \n"
@@ -353,8 +353,8 @@ static multiboot_info_t *mbi2_reloc(uint32_t mbi_in, uint32_t video_out)
 }
 
 /* SAF-1-safe */
-void *__attribute__((__stdcall__))
-reloc(uint32_t magic, uint32_t in, uint32_t trampoline, uint32_t video_info)
+void *reloc(uint32_t magic, uint32_t in, uint32_t trampoline,
+            uint32_t video_info)
 {
     alloc = trampoline;
 

base-commit: e884903ec56be8b0cc658cc5ba7c1cb70f25208d
prerequisite-patch-id: eb50bac1dc5f4cec5653dfe46bb2c714035c8235
prerequisite-patch-id: f08a81c17bc5d1fbc73ad681dd834e077929fbd4
prerequisite-patch-id: ee066a61cacb309e0a9a186051a380ab985e7d63
prerequisite-patch-id: 78e4e47fd945ab266604b208a735601eed301017
-- 
2.39.2


Re: [PATCH v2] x86/boot: Use fastcall for 32bit code
Posted by Jan Beulich 2 months, 2 weeks ago
On 02.09.2024 18:54, Andrew Cooper wrote:
> This is marginally more efficient, but is mostly to get rid of the use of
> stdcall in cmdline.c and reloc.c
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>

> v2:
>  * Fixed up to work properly.
> 
> I'm tempted to rebase this ahead of "[PATCH 3/4] x86/boot: Use
> <xen/compiler.h>" and remove the transitory open-coding of __stdcall.

Fine with me either way.

Jan