x86 boot code cleanups, batch 1

[PATCH v1 11/14] x86/boot: use __seg_fs and __seg_gs in the real-mode boot code

Posted by H. Peter Anvin 2 weeks, 5 days ago

All supported versions of gcc support __seg_fs and __seg_gs now.
All supported versions of clang support __seg_fs and __seg_gs too,
except for two bugs (as of clang 21, at least):

1. The %fs: and %gs: prefix does not get emitted in inline assembly.
2. An internal compiler error when addressing symbols directly.

However, none of these are required in the boot code. Furthermore,
this makes it possible to remove the absolute_pointer() hack in the
fs/gs access functions.

This requires adding a barrier() to a20.c, to prevent the compiler
from eliding the load from the aliased memory address.

Remove the unused memcmp_[fg]s() functions.

Finally, ds() is by necessity constant, so mark the function as such.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
---
 arch/x86/boot/a20.c  |  1 +
 arch/x86/boot/boot.h | 81 ++++++++++++++------------------------------
 2 files changed, 27 insertions(+), 55 deletions(-)

diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 3ab6cd8eaa31..52c3fccdcb70 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -63,6 +63,7 @@ static int a20_test(int loops)
 	while (loops--) {
 		wrgs32(++ctr, A20_TEST_ADDR);
 		io_delay();	/* Serialize and make delay constant */
+		barrier();	/* Compiler won't know about fs/gs overlap */
 		ok = rdfs32(A20_TEST_ADDR+0x10) ^ ctr;
 		if (ok)
 			break;
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index b4eb8405ba55..4d3549ed7987 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -45,7 +45,7 @@ static inline void io_delay(void)
 
 /* These functions are used to reference data in other segments. */
 
-static inline u16 ds(void)
+static inline __attribute_const__ u16 ds(void)
 {
 	u16 seg;
 	asm("movw %%ds,%0" : "=rm" (seg));
@@ -54,7 +54,7 @@ static inline u16 ds(void)
 
 static inline void set_fs(u16 seg)
 {
-	asm volatile("movw %0,%%fs" : : "rm" (seg));
+	asm volatile("movw %0,%%fs" : : "rm" (seg) : "memory");
 }
 static inline u16 fs(void)
 {
@@ -65,7 +65,7 @@ static inline u16 fs(void)
 
 static inline void set_gs(u16 seg)
 {
-	asm volatile("movw %0,%%gs" : : "rm" (seg));
+	asm volatile("movw %0,%%gs" : : "rm" (seg) : "memory");
 }
 static inline u16 gs(void)
 {
@@ -76,96 +76,67 @@ static inline u16 gs(void)
 
 typedef unsigned int addr_t;
 
+/*
+ * WARNING: as of clang 21, clang has the following two bugs related
+ * to __seg_fs and __seg_gs:
+ *
+ * 1. The %fs: and %gs: prefix does not get emitted in inline assembly.
+ * 2. An internal compiler error when addressing symbols directly.
+ *
+ * Neither of those constructs are currently used in the boot code.
+ * If they ever are, and those bugs still remain, then those bugs will
+ * need to be worked around.
+ */
 static inline u8 rdfs8(addr_t addr)
 {
-	u8 *ptr = (u8 *)absolute_pointer(addr);
-	u8 v;
-	asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr));
-	return v;
+	return *(__seg_fs const u8 *)addr;
 }
 static inline u16 rdfs16(addr_t addr)
 {
-	u16 *ptr = (u16 *)absolute_pointer(addr);
-	u16 v;
-	asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
-	return v;
+	return *(__seg_fs const u16 *)addr;
 }
 static inline u32 rdfs32(addr_t addr)
 {
-	u32 *ptr = (u32 *)absolute_pointer(addr);
-	u32 v;
-	asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
-	return v;
+	return *(__seg_fs const u32 *)addr;
 }
 
 static inline void wrfs8(u8 v, addr_t addr)
 {
-	u8 *ptr = (u8 *)absolute_pointer(addr);
-	asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v));
+	*(__seg_fs u8 *)addr = v;
 }
 static inline void wrfs16(u16 v, addr_t addr)
 {
-	u16 *ptr = (u16 *)absolute_pointer(addr);
-	asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
+	*(__seg_fs u16 *)addr = v;
 }
 static inline void wrfs32(u32 v, addr_t addr)
 {
-	u32 *ptr = (u32 *)absolute_pointer(addr);
-	asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
+	*(__seg_fs u32 *)addr = v;
 }
 
 static inline u8 rdgs8(addr_t addr)
 {
-	u8 *ptr = (u8 *)absolute_pointer(addr);
-	u8 v;
-	asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr));
-	return v;
+	return *(__seg_gs const u8 *)addr;
 }
 static inline u16 rdgs16(addr_t addr)
 {
-	u16 *ptr = (u16 *)absolute_pointer(addr);
-	u16 v;
-	asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
-	return v;
+	return *(__seg_gs const u16 *)addr;
 }
 static inline u32 rdgs32(addr_t addr)
 {
-	u32 *ptr = (u32 *)absolute_pointer(addr);
-	u32 v;
-	asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
-	return v;
+	return *(__seg_gs const u32 *)addr;
 }
 
 static inline void wrgs8(u8 v, addr_t addr)
 {
-	u8 *ptr = (u8 *)absolute_pointer(addr);
-	asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v));
+	*(__seg_gs u8 *)addr = v;
 }
 static inline void wrgs16(u16 v, addr_t addr)
 {
-	u16 *ptr = (u16 *)absolute_pointer(addr);
-	asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
+	*(__seg_gs u16 *)addr = v;
 }
 static inline void wrgs32(u32 v, addr_t addr)
 {
-	u32 *ptr = (u32 *)absolute_pointer(addr);
-	asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
-}
-
-/* Note: these only return true/false, not a signed return value! */
-static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len)
-{
-	bool diff;
-	asm volatile("fs repe cmpsb"
-		     : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len));
-	return diff;
-}
-static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len)
-{
-	bool diff;
-	asm volatile("gs repe cmpsb"
-		     : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len));
-	return diff;
+	*(__seg_gs u32 *)addr = v;
 }
 
 /* Heap -- available for dynamic lists. */
-- 
2.52.0

Re: [PATCH v1 11/14] x86/boot: use __seg_fs and __seg_gs in the real-mode boot code

Posted by Uros Bizjak 2 weeks, 4 days ago

On Tue, Jan 20, 2026 at 8:54 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> All supported versions of gcc support __seg_fs and __seg_gs now.
> All supported versions of clang support __seg_fs and __seg_gs too,
> except for two bugs (as of clang 21, at least):
>
> 1. The %fs: and %gs: prefix does not get emitted in inline assembly.
> 2. An internal compiler error when addressing symbols directly.
>
> However, none of these are required in the boot code. Furthermore,
> this makes it possible to remove the absolute_pointer() hack in the
> fs/gs access functions.
>
> This requires adding a barrier() to a20.c, to prevent the compiler
> from eliding the load from the aliased memory address.
>
> Remove the unused memcmp_[fg]s() functions.
>
> Finally, ds() is by necessity constant, so mark the function as such.
>
> Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>

Reviewed-by: Uros Bizjak <ubizjak@gmail.com>

> ---
>  arch/x86/boot/a20.c  |  1 +
>  arch/x86/boot/boot.h | 81 ++++++++++++++------------------------------
>  2 files changed, 27 insertions(+), 55 deletions(-)
>
> diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
> index 3ab6cd8eaa31..52c3fccdcb70 100644
> --- a/arch/x86/boot/a20.c
> +++ b/arch/x86/boot/a20.c
> @@ -63,6 +63,7 @@ static int a20_test(int loops)
>         while (loops--) {
>                 wrgs32(++ctr, A20_TEST_ADDR);
>                 io_delay();     /* Serialize and make delay constant */
> +               barrier();      /* Compiler won't know about fs/gs overlap */
>                 ok = rdfs32(A20_TEST_ADDR+0x10) ^ ctr;
>                 if (ok)
>                         break;
> diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
> index b4eb8405ba55..4d3549ed7987 100644
> --- a/arch/x86/boot/boot.h
> +++ b/arch/x86/boot/boot.h
> @@ -45,7 +45,7 @@ static inline void io_delay(void)
>
>  /* These functions are used to reference data in other segments. */
>
> -static inline u16 ds(void)
> +static inline __attribute_const__ u16 ds(void)
>  {
>         u16 seg;
>         asm("movw %%ds,%0" : "=rm" (seg));
> @@ -54,7 +54,7 @@ static inline u16 ds(void)
>
>  static inline void set_fs(u16 seg)
>  {
> -       asm volatile("movw %0,%%fs" : : "rm" (seg));
> +       asm volatile("movw %0,%%fs" : : "rm" (seg) : "memory");
>  }
>  static inline u16 fs(void)
>  {
> @@ -65,7 +65,7 @@ static inline u16 fs(void)
>
>  static inline void set_gs(u16 seg)
>  {
> -       asm volatile("movw %0,%%gs" : : "rm" (seg));
> +       asm volatile("movw %0,%%gs" : : "rm" (seg) : "memory");
>  }
>  static inline u16 gs(void)
>  {
> @@ -76,96 +76,67 @@ static inline u16 gs(void)
>
>  typedef unsigned int addr_t;
>
> +/*
> + * WARNING: as of clang 21, clang has the following two bugs related
> + * to __seg_fs and __seg_gs:
> + *
> + * 1. The %fs: and %gs: prefix does not get emitted in inline assembly.
> + * 2. An internal compiler error when addressing symbols directly.
> + *
> + * Neither of those constructs are currently used in the boot code.
> + * If they ever are, and those bugs still remain, then those bugs will
> + * need to be worked around.
> + */
>  static inline u8 rdfs8(addr_t addr)
>  {
> -       u8 *ptr = (u8 *)absolute_pointer(addr);
> -       u8 v;
> -       asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr));
> -       return v;
> +       return *(__seg_fs const u8 *)addr;
>  }
>  static inline u16 rdfs16(addr_t addr)
>  {
> -       u16 *ptr = (u16 *)absolute_pointer(addr);
> -       u16 v;
> -       asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
> -       return v;
> +       return *(__seg_fs const u16 *)addr;
>  }
>  static inline u32 rdfs32(addr_t addr)
>  {
> -       u32 *ptr = (u32 *)absolute_pointer(addr);
> -       u32 v;
> -       asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
> -       return v;
> +       return *(__seg_fs const u32 *)addr;
>  }
>
>  static inline void wrfs8(u8 v, addr_t addr)
>  {
> -       u8 *ptr = (u8 *)absolute_pointer(addr);
> -       asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v));
> +       *(__seg_fs u8 *)addr = v;
>  }
>  static inline void wrfs16(u16 v, addr_t addr)
>  {
> -       u16 *ptr = (u16 *)absolute_pointer(addr);
> -       asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
> +       *(__seg_fs u16 *)addr = v;
>  }
>  static inline void wrfs32(u32 v, addr_t addr)
>  {
> -       u32 *ptr = (u32 *)absolute_pointer(addr);
> -       asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
> +       *(__seg_fs u32 *)addr = v;
>  }
>
>  static inline u8 rdgs8(addr_t addr)
>  {
> -       u8 *ptr = (u8 *)absolute_pointer(addr);
> -       u8 v;
> -       asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr));
> -       return v;
> +       return *(__seg_gs const u8 *)addr;
>  }
>  static inline u16 rdgs16(addr_t addr)
>  {
> -       u16 *ptr = (u16 *)absolute_pointer(addr);
> -       u16 v;
> -       asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
> -       return v;
> +       return *(__seg_gs const u16 *)addr;
>  }
>  static inline u32 rdgs32(addr_t addr)
>  {
> -       u32 *ptr = (u32 *)absolute_pointer(addr);
> -       u32 v;
> -       asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
> -       return v;
> +       return *(__seg_gs const u32 *)addr;
>  }
>
>  static inline void wrgs8(u8 v, addr_t addr)
>  {
> -       u8 *ptr = (u8 *)absolute_pointer(addr);
> -       asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v));
> +       *(__seg_gs u8 *)addr = v;
>  }
>  static inline void wrgs16(u16 v, addr_t addr)
>  {
> -       u16 *ptr = (u16 *)absolute_pointer(addr);
> -       asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
> +       *(__seg_gs u16 *)addr = v;
>  }
>  static inline void wrgs32(u32 v, addr_t addr)
>  {
> -       u32 *ptr = (u32 *)absolute_pointer(addr);
> -       asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
> -}
> -
> -/* Note: these only return true/false, not a signed return value! */
> -static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len)
> -{
> -       bool diff;
> -       asm volatile("fs repe cmpsb"
> -                    : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len));
> -       return diff;
> -}
> -static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len)
> -{
> -       bool diff;
> -       asm volatile("gs repe cmpsb"
> -                    : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len));
> -       return diff;
> +       *(__seg_gs u32 *)addr = v;
>  }
>
>  /* Heap -- available for dynamic lists. */
> --
> 2.52.0
>

Re: [PATCH v1 11/14] x86/boot: use __seg_fs and __seg_gs in the real-mode boot code

Posted by Uros Bizjak 2 weeks, 5 days ago

On Tue, Jan 20, 2026 at 8:54 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> All supported versions of gcc support __seg_fs and __seg_gs now.
> All supported versions of clang support __seg_fs and __seg_gs too,
> except for two bugs (as of clang 21, at least):
>
> 1. The %fs: and %gs: prefix does not get emitted in inline assembly.
> 2. An internal compiler error when addressing symbols directly.
>
> However, none of these are required in the boot code. Furthermore,
> this makes it possible to remove the absolute_pointer() hack in the
> fs/gs access functions.
>
> This requires adding a barrier() to a20.c, to prevent the compiler
> from eliding the load from the aliased memory address.
>
> Remove the unused memcmp_[fg]s() functions.
>
> Finally, ds() is by necessity constant, so mark the function as such.
>
> Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
> ---
>  arch/x86/boot/a20.c  |  1 +
>  arch/x86/boot/boot.h | 81 ++++++++++++++------------------------------
>  2 files changed, 27 insertions(+), 55 deletions(-)
>
> diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
> index 3ab6cd8eaa31..52c3fccdcb70 100644
> --- a/arch/x86/boot/a20.c
> +++ b/arch/x86/boot/a20.c
> @@ -63,6 +63,7 @@ static int a20_test(int loops)
>         while (loops--) {
>                 wrgs32(++ctr, A20_TEST_ADDR);
>                 io_delay();     /* Serialize and make delay constant */
> +               barrier();      /* Compiler won't know about fs/gs overlap */
>                 ok = rdfs32(A20_TEST_ADDR+0x10) ^ ctr;
>                 if (ok)
>                         break;

This particular issue is not due to the compiler not knowing about
the fs/gs overlap (if the compiler determined that the write and the
read were to the same non-volatile address, it would simply remove
both the load and the store), but due to the compiler hoisting the
load and sinking the store out of the loop. The compiler considers
these two addresses to be different (they are also defined in two
different named address spaces), and optimizes accesses to them. So,
without barrier(), it simply loads the values from A20_TEST_ADDR and
A20_TEST_ADDR+0x10 before the loop, resulting in:

  9:   65 8b 0d 00 02 00 00    mov    %gs:0x200,%ecx
 ...
 16:   64 8b 35 10 02 00 00    mov    %fs:0x210,%esi
 ...
 28:   83 ea 01                sub    $0x1,%edx
 2b:   74 0a                   je     37 <a20_test_ref+0x37>
 2d:   e6 80                   out    %al,$0x80
 2f:   89 d9                   mov    %ebx,%ecx
 31:   29 d1                   sub    %edx,%ecx
 33:   39 ce                   cmp    %ecx,%esi
 35:   74 f1                   je     28 <a20_test_ref+0x28>

The solution with barrier() introduces a memory clobber between the
store and the load, so the compiler is now forced to perform both the
store and the load because of the side effects of the barrier() in
between. This kind of works, but it is just a workaround for what
really happens. In reality, the value at the test address changes
"behind the compiler's back"; in other words, the variable's value can
change in ways the compiler cannot predict.

My proposal is to use a volatile pointer to an absolute address, so
that the unwanted optimizations are suppressed. The generated code is
the same as with barrier(), but now the code tells the compiler that
every read and write to this address must happen exactly as written in
the source code. Before your patch, the accessors were defined with
volatile asm, and this is the place where the volatile qualifier
matters. So, my proposed code would read:

#define A20_TEST_ADDR    (4*0x80)

#define A20_TEST_GS (*(volatile __seg_gs u32 *)A20_TEST_ADDR)
#define A20_TEST_FS (*(volatile __seg_fs u32 *)(A20_TEST_ADDR+0x10))

/*
 * Proposed a20_test(): same probe as the patched loop, but expressed with
 * volatile named-address-space accesses (A20_TEST_GS / A20_TEST_FS) instead
 * of the rdfs32()/wrgs32() helpers plus barrier().
 *
 * For at most @loops iterations, an incremented counter is stored through
 * A20_TEST_GS and the location A20_TEST_FS is read back and compared with it.
 * The word originally found at A20_TEST_GS is restored before returning.
 *
 * Returns the remaining iteration count: nonzero if the loop was left early
 * because the value read through FS differed from the counter, 0 if every
 * iteration read back the counter.
 *
 * NOTE(review): presumably a mismatch means the two locations do not alias,
 * i.e. A20 is enabled -- confirm against the callers.
 * NOTE(review): GS is not loaded here; this presumably relies on GS already
 * holding segment 0 on entry -- confirm.
 * NOTE(review): with loops == 0 the do/while body still runs and --loops
 * goes negative, unlike the `while (loops--)` form in the patch, which exits
 * immediately -- confirm no caller passes 0.
 */
static int a20_test(int loops)
{
    int saved, ctr;

    /* FS base 0xffff0: FS:(A20_TEST_ADDR+0x10) is linear 0x100200. */
    set_fs(0xffff);

    /* Remember the original word so it can be restored on exit. */
    saved = ctr = A20_TEST_GS;

    do {
        /* Volatile store: must be emitted on every iteration. */
        A20_TEST_GS = ++ctr;
        io_delay();    /* Make constant delay */
        /* Volatile load: re-read each time, never hoisted out of the loop. */
        if (A20_TEST_FS != ctr)
            break;
    } while (--loops);

    /* Put back the word that was overwritten by the probe. */
    A20_TEST_GS = saved;
    /* Not decremented on the break path, so nonzero after an early exit. */
    return loops;
}

BR,
Uros.

re: [PATCH v1 08/14] x86: make CONFIG_EFI_STUB unconditional

Posted by Simon Glass 2 weeks, 3 days ago

Hi Peter,

On Tue, Jan 20, 2026 at 8:54 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> The EFI stub code is mature, most current x86 systems require EFI to
> boot, and as it is exclusively preboot code, it doesn't affect the
> runtime memory footprint at all.
> 
> It makes absolutely no sense to omit it anymore, so make it
> unconditional.
> 
> Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
> ---
>  arch/x86/Kconfig                  | 14 ++------------
>  arch/x86/boot/compressed/Makefile |  2 --
>  arch/x86/boot/compressed/error.c  |  2 --
>  arch/x86/boot/header.S            |  3 ---
>  4 files changed, 2 insertions(+), 19 deletions(-)

At least with QEMU the EFI protocol adds quite a lot of overhead.

Is there any actual need for this?

Regards,
Simon

re: [PATCH v1 08/14] x86: make CONFIG_EFI_STUB unconditional

Posted by H. Peter Anvin 2 weeks, 3 days ago

On January 22, 2026 10:57:39 AM PST, Simon Glass <sjg@chromium.org> wrote:
>Hi Peter,
>
>On Tue, Jan 20, 2026 at 8:54 PM H. Peter Anvin <hpa@zytor.com> wrote:
>>
>> The EFI stub code is mature, most current x86 systems require EFI to
>> boot, and as it is exclusively preboot code, it doesn't affect the
>> runtime memory footprint at all.
>> 
>> It makes absolutely no sense to omit it anymore, so make it
>> unconditional.
>> 
>> Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
>> ---
>>  arch/x86/Kconfig                  | 14 ++------------
>>  arch/x86/boot/compressed/Makefile |  2 --
>>  arch/x86/boot/compressed/error.c  |  2 --
>>  arch/x86/boot/header.S            |  3 ---
>>  4 files changed, 2 insertions(+), 19 deletions(-)
>
>At least with QEMU the EFI protocol adds quite a lot of overhead.
>
>Is there any actual need for this?
>
>Regards,
>Simon
>

Including the EFI stub doesn't mean using EFI to boot is required.