[PATCH v1 12/14] x86/boot: tweak a20.c for better code generation

H. Peter Anvin posted 14 patches 2 weeks, 5 days ago
[PATCH v1 12/14] x86/boot: tweak a20.c for better code generation
Posted by H. Peter Anvin 2 weeks, 5 days ago
Do some minor tweaks to arch/x86/boot/a20.c for better code
generation, made possible by the __seg_fs/__seg_gs changes.

Move the die() call to a20.c itself; there is no reason to push an
error code upwards just to die() there.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
---
 arch/x86/boot/a20.c  | 24 +++++++++++-------------
 arch/x86/boot/boot.h |  2 +-
 arch/x86/boot/pm.c   |  3 +--
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 52c3fccdcb70..38a1cad8a553 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -53,24 +53,22 @@ static int empty_8042(void)
 
 static int a20_test(int loops)
 {
-	int ok = 0;
 	int saved, ctr;
 
 	set_fs(0xffff);
 
 	saved = ctr = rdgs32(A20_TEST_ADDR);
 
-	while (loops--) {
+	do {
 		wrgs32(++ctr, A20_TEST_ADDR);
 		io_delay();	/* Serialize and make delay constant */
 		barrier();	/* Compiler won't know about fs/gs overlap */
-		ok = rdfs32(A20_TEST_ADDR+0x10) ^ ctr;
-		if (ok)
+		if (rdfs32(A20_TEST_ADDR+0x10) != ctr)
 			break;
-	}
+	} while (--loops);
 
 	wrgs32(saved, A20_TEST_ADDR);
-	return ok;
+	return loops;
 }
 
 /* Quick test to see if A20 is already enabled */
@@ -125,7 +123,7 @@ static void enable_a20_fast(void)
 
 #define A20_ENABLE_LOOPS 255	/* Number of times to try */
 
-int enable_a20(void)
+void enable_a20(void)
 {
        int loops = A20_ENABLE_LOOPS;
        int kbc_err;
@@ -134,30 +132,30 @@ int enable_a20(void)
 	       /* First, check to see if A20 is already enabled
 		  (legacy free, etc.) */
 	       if (a20_test_short())
-		       return 0;
+		       return;
 
 	       /* Next, try the BIOS (INT 0x15, AX=0x2401) */
 	       enable_a20_bios();
 	       if (a20_test_short())
-		       return 0;
+		       return;
 
 	       /* Try enabling A20 through the keyboard controller */
 	       kbc_err = empty_8042();
 
 	       if (a20_test_short())
-		       return 0; /* BIOS worked, but with delayed reaction */
+		       return; /* BIOS worked, but with delayed reaction */
 
 	       if (!kbc_err) {
 		       enable_a20_kbc();
 		       if (a20_test_long())
-			       return 0;
+			      return;
 	       }
 
 	       /* Finally, try enabling the "fast A20 gate" */
 	       enable_a20_fast();
 	       if (a20_test_long())
-		       return 0;
+		       return;
        }
 
-       return -1;
+       die("A20 gate not responding, unable to boot...\n");
 }
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 4d3549ed7987..584c89d0738b 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -167,7 +167,7 @@ void copy_to_fs(addr_t dst, void *src, size_t len);
 void *copy_from_fs(void *dst, addr_t src, size_t len);
 
 /* a20.c */
-int enable_a20(void);
+void enable_a20(void);
 
 /* apm.c */
 int query_apm_bios(void);
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 3be89ba4b1b3..e39689ed65ea 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -106,8 +106,7 @@ void go_to_protected_mode(void)
 	realmode_switch_hook();
 
 	/* Enable the A20 gate */
-	if (enable_a20())
-		die("A20 gate not responding, unable to boot...\n");
+	enable_a20();
 
 	/* Reset coprocessor (IGNNE#) */
 	reset_coprocessor();
-- 
2.52.0
Re: [PATCH v1 12/14] x86/boot: tweak a20.c for better code generation
Posted by David Laight 2 weeks, 5 days ago
On Tue, 20 Jan 2026 11:54:04 -0800
"H. Peter Anvin" <hpa@zytor.com> wrote:

> Do some minor tweaks to arch/x86/boot/a20.c for better code
> generation, made possible by the __seg_fs/__seg_gs changes.
> 
> Move the die() call to a20.c itself; there is no reason to push an
> error code upwards just to die() there.

Can't this code just do:
	sv = *0000:addr;
	val = *ffff:addr+10;
	if (sv != val)
		return 1;
	*0000:addr = ~val;
	io_delay();
	val ^= *ffff:addr+10;
	*0000:addr = sv;
	return val != ~0;

That inverts all bits, inverting one is enough.
No loops needed.

	David

> 
> Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
> ---
>  arch/x86/boot/a20.c  | 24 +++++++++++-------------
>  arch/x86/boot/boot.h |  2 +-
>  arch/x86/boot/pm.c   |  3 +--
>  3 files changed, 13 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
> index 52c3fccdcb70..38a1cad8a553 100644
> --- a/arch/x86/boot/a20.c
> +++ b/arch/x86/boot/a20.c
> @@ -53,24 +53,22 @@ static int empty_8042(void)
>  
>  static int a20_test(int loops)
>  {
> -	int ok = 0;
>  	int saved, ctr;
>  
>  	set_fs(0xffff);
>  
>  	saved = ctr = rdgs32(A20_TEST_ADDR);
>  
> -	while (loops--) {
> +	do {
>  		wrgs32(++ctr, A20_TEST_ADDR);
>  		io_delay();	/* Serialize and make delay constant */
>  		barrier();	/* Compiler won't know about fs/gs overlap */
> -		ok = rdfs32(A20_TEST_ADDR+0x10) ^ ctr;
> -		if (ok)
> +		if (rdfs32(A20_TEST_ADDR+0x10) != ctr)
>  			break;
> -	}
> +	} while (--loops);
>  
>  	wrgs32(saved, A20_TEST_ADDR);
> -	return ok;
> +	return loops;
>  }
>  
>  /* Quick test to see if A20 is already enabled */
> @@ -125,7 +123,7 @@ static void enable_a20_fast(void)
>  
>  #define A20_ENABLE_LOOPS 255	/* Number of times to try */
>  
> -int enable_a20(void)
> +void enable_a20(void)
>  {
>         int loops = A20_ENABLE_LOOPS;
>         int kbc_err;
> @@ -134,30 +132,30 @@ int enable_a20(void)
>  	       /* First, check to see if A20 is already enabled
>  		  (legacy free, etc.) */
>  	       if (a20_test_short())
> -		       return 0;
> +		       return;
>  
>  	       /* Next, try the BIOS (INT 0x15, AX=0x2401) */
>  	       enable_a20_bios();
>  	       if (a20_test_short())
> -		       return 0;
> +		       return;
>  
>  	       /* Try enabling A20 through the keyboard controller */
>  	       kbc_err = empty_8042();
>  
>  	       if (a20_test_short())
> -		       return 0; /* BIOS worked, but with delayed reaction */
> +		       return; /* BIOS worked, but with delayed reaction */
>  
>  	       if (!kbc_err) {
>  		       enable_a20_kbc();
>  		       if (a20_test_long())
> -			       return 0;
> +			      return;
>  	       }
>  
>  	       /* Finally, try enabling the "fast A20 gate" */
>  	       enable_a20_fast();
>  	       if (a20_test_long())
> -		       return 0;
> +		       return;
>         }
>  
> -       return -1;
> +       die("A20 gate not responding, unable to boot...\n");
>  }
> diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
> index 4d3549ed7987..584c89d0738b 100644
> --- a/arch/x86/boot/boot.h
> +++ b/arch/x86/boot/boot.h
> @@ -167,7 +167,7 @@ void copy_to_fs(addr_t dst, void *src, size_t len);
>  void *copy_from_fs(void *dst, addr_t src, size_t len);
>  
>  /* a20.c */
> -int enable_a20(void);
> +void enable_a20(void);
>  
>  /* apm.c */
>  int query_apm_bios(void);
> diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
> index 3be89ba4b1b3..e39689ed65ea 100644
> --- a/arch/x86/boot/pm.c
> +++ b/arch/x86/boot/pm.c
> @@ -106,8 +106,7 @@ void go_to_protected_mode(void)
>  	realmode_switch_hook();
>  
>  	/* Enable the A20 gate */
> -	if (enable_a20())
> -		die("A20 gate not responding, unable to boot...\n");
> +	enable_a20();
>  
>  	/* Reset coprocessor (IGNNE#) */
>  	reset_coprocessor();
Re: [PATCH v1 12/14] x86/boot: tweak a20.c for better code generation
Posted by Maciej W. Rozycki 2 weeks, 2 days ago
On Wed, 21 Jan 2026, David Laight wrote:

> No loops needed.

 A loop is needed because there can be a considerable delay from issuing 
the I/O request to flip the A20 gate till the circuitry responding.  This 
is particularly true with the command issued to the 8042 device, which is 
a microcontroller running its own firmware that needs it time to process 
an incoming request to drive one of the microcontroller's GPIOs.  There 
was a reason for port 0x92 circuitry later added to the PC architecture 
with the IBM PS/2 being called the "fast A20 gate".

  Maciej
Re: [PATCH v1 12/14] x86/boot: tweak a20.c for better code generation
Posted by H. Peter Anvin 2 weeks, 2 days ago
On 2026-01-23 19:00, Maciej W. Rozycki wrote:
> On Wed, 21 Jan 2026, David Laight wrote:
> 
>> No loops needed.
> 
>  A loop is needed because there can be a considerable delay from issuing 
> the I/O request to flip the A20 gate till the circuitry responding.  This 
> is particularly true with the command issued to the 8042 device, which is 
> a microcontroller running its own firmware that needs it time to process 
> an incoming request to drive one of the microcontroller's GPIOs.  There 
> was a reason for port 0x92 circuitry later added to the PC architecture 
> with the IBM PS/2 being called the "fast A20 gate".
> 

Indeed. I thought I had responded to this already but I hadn't, apparently.

Note that the "long" delay is 2^21 loops! That number wasn't taken out of the
air, either; we found machines that actually needed that many iterations.

In the case where A20 is enabled already, the loop terminates on either the
first or second iteration (the second iteration is when the value at 0x1000200
is exactly 1 higher than the value at 0x200.

Modern machines (Nehalem+) already have A20 enabled, and most machines of the
i686+ generation implement int 0x15 function 0x2401.

	-hpa
Re: [PATCH v1 12/14] x86/boot: tweak a20.c for better code generation
Posted by David Laight 2 weeks, 1 day ago
On Fri, 23 Jan 2026 20:24:55 -0800
"H. Peter Anvin" <hpa@zytor.com> wrote:

> On 2026-01-23 19:00, Maciej W. Rozycki wrote:
> > On Wed, 21 Jan 2026, David Laight wrote:
> >   
> >> No loops needed.  
> > 
> >  A loop is needed because there can be a considerable delay from issuing 
> > the I/O request to flip the A20 gate till the circuitry responding.  This 
> > is particularly true with the command issued to the 8042 device, which is 
> > a microcontroller running its own firmware that needs it time to process 
> > an incoming request to drive one of the microcontroller's GPIOs.  There 
> > was a reason for port 0x92 circuitry later added to the PC architecture 
> > with the IBM PS/2 being called the "fast A20 gate".
> >   
> 
> Indeed. I thought I had responded to this already but I hadn't, apparently.
> 
> Note that the "long" delay is 2^21 loops! That number wasn't taken out of the
> air, either; we found machines that actually needed that many iterations.

Ok, so you need a loop because it might take ages for the value read from
0x1000200 to change.
But there is no need to keep changing the value.
The comments in the code don't really stress that.

> In the case where A20 is enabled already, the loop terminates on either the
> first or second iteration (the second iteration is when the value at 0x1000200
> is exactly 1 higher than the value at 0x200.
> 
> Modern machines (Nehalem+) already have A20 enabled, and most machines of the
> i686+ generation implement int 0x15 function 0x2401.

I know some of the history.
And just read some more of the gory details...

A20 being disabled is there to make a 286 compatible with the older 8086 PCs
and any software that relied on address wrapping (rather than using it to get
an extra ~64kB in real mode).
That would be for dos and win 3.11...

The only 8088 and 286 cpu I used were on IO cards.

> 
> 	-hpa
>
Re: [PATCH v1 12/14] x86/boot: tweak a20.c for better code generation
Posted by H. Peter Anvin 2 weeks, 1 day ago
On January 24, 2026 3:07:41 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
>On Fri, 23 Jan 2026 20:24:55 -0800
>"H. Peter Anvin" <hpa@zytor.com> wrote:
>
>> On 2026-01-23 19:00, Maciej W. Rozycki wrote:
>> > On Wed, 21 Jan 2026, David Laight wrote:
>> >   
>> >> No loops needed.  
>> > 
>> >  A loop is needed because there can be a considerable delay from issuing 
>> > the I/O request to flip the A20 gate till the circuitry responding.  This 
>> > is particularly true with the command issued to the 8042 device, which is 
>> > a microcontroller running its own firmware that needs it time to process 
>> > an incoming request to drive one of the microcontroller's GPIOs.  There 
>> > was a reason for port 0x92 circuitry later added to the PC architecture 
>> > with the IBM PS/2 being called the "fast A20 gate".
>> >   
>> 
>> Indeed. I thought I had responded to this already but I hadn't, apparently.
>> 
>> Note that the "long" delay is 2^21 loops! That number wasn't taken out of the
>> air, either; we found machines that actually needed that many iterations.
>
>Ok, so you need a loop because it might take ages for the value read from
>0x1000200 to change.
>But there is no need to keep changing the value.
>The comments in the code don't really stress that.
>
>> In the case where A20 is enabled already, the loop terminates on either the
>> first or second iteration (the second iteration is when the value at 0x1000200
>> is exactly 1 higher than the value at 0x200.
>> 
>> Modern machines (Nehalem+) already have A20 enabled, and most machines of the
>> i686+ generation implement int 0x15 function 0x2401.
>
>I know some of the history.
>And just read some more of the gory details...
>
>A20 being disabled is there to make a 286 compatible with the older 8086 PCs
>and any software that relied on address wrapping (rather than using it to get
>an extra ~64kB in real mode).
>That would be for dos and win 3.11...
>
>The only 8088 and 286 cpu I used were on IO cards.
>
>> 
>> 	-hpa
>> 
>

No, there is a reason to keep changing the value: you have no idea what is currently stored in that memory, *and you have no way of knowing*.

Whatever value you write might purely accidentally be the value that already is stored at that memory location.
Re: [PATCH v1 12/14] x86/boot: tweak a20.c for better code generation
Posted by H. Peter Anvin 2 weeks, 1 day ago
On January 24, 2026 3:16:18 PM PST, "H. Peter Anvin" <hpa@zytor.com> wrote:
>On January 24, 2026 3:07:41 PM PST, David Laight <david.laight.linux@gmail.com> wrote:
>>On Fri, 23 Jan 2026 20:24:55 -0800
>>"H. Peter Anvin" <hpa@zytor.com> wrote:
>>
>>> On 2026-01-23 19:00, Maciej W. Rozycki wrote:
>>> > On Wed, 21 Jan 2026, David Laight wrote:
>>> >   
>>> >> No loops needed.  
>>> > 
>>> >  A loop is needed because there can be a considerable delay from issuing 
>>> > the I/O request to flip the A20 gate till the circuitry responding.  This 
>>> > is particularly true with the command issued to the 8042 device, which is 
>>> > a microcontroller running its own firmware that needs it time to process 
>>> > an incoming request to drive one of the microcontroller's GPIOs.  There 
>>> > was a reason for port 0x92 circuitry later added to the PC architecture 
>>> > with the IBM PS/2 being called the "fast A20 gate".
>>> >   
>>> 
>>> Indeed. I thought I had responded to this already but I hadn't, apparently.
>>> 
>>> Note that the "long" delay is 2^21 loops! That number wasn't taken out of the
>>> air, either; we found machines that actually needed that many iterations.
>>
>>Ok, so you need a loop because it might take ages for the value read from
>>0x1000200 to change.
>>But there is no need to keep changing the value.
>>The comments in the code don't really stress that.
>>
>>> In the case where A20 is enabled already, the loop terminates on either the
>>> first or second iteration (the second iteration is when the value at 0x1000200
>>> is exactly 1 higher than the value at 0x200.
>>> 
>>> Modern machines (Nehalem+) already have A20 enabled, and most machines of the
>>> i686+ generation implement int 0x15 function 0x2401.
>>
>>I know some of the history.
>>And just read some more of the gory details...
>>
>>A20 being disabled is there to make a 286 compatible with the older 8086 PCs
>>and any software that relied on address wrapping (rather than using it to get
>>an extra ~64kB in real mode).
>>That would be for dos and win 3.11...
>>
>>The only 8088 and 286 cpu I used were on IO cards.
>>
>>> 
>>> 	-hpa
>>> 
>>
>
>No, there is a reason to keep changing the value: you have no idea what is currently stored in that memory, *and you have no way of knowing*.
>
>Whatever value you write might purely accidentally be the value that already is stored at that memory location.
The other thing about this code is that performance is irrelevant – it is a busy wait loop! – but consistency (hence the io_delay) and code size matter.
Re: [PATCH v1 12/14] x86/boot: tweak a20.c for better code generation
Posted by Uros Bizjak 2 weeks, 5 days ago
On Tue, Jan 20, 2026 at 8:54 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> Do some minor tweaks to arch/x86/boot/a20.c for better code
> generation, made possible by the __seg_fs/__seg_gs changes.
>
> Move the die() call to a20.c itself; there is no reason to push an
> error code upwards just to die() there.
>
> Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>

Acked-by: Uros Bizjak <ubizjak@gmail.com>

> ---
>  arch/x86/boot/a20.c  | 24 +++++++++++-------------
>  arch/x86/boot/boot.h |  2 +-
>  arch/x86/boot/pm.c   |  3 +--
>  3 files changed, 13 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
> index 52c3fccdcb70..38a1cad8a553 100644
> --- a/arch/x86/boot/a20.c
> +++ b/arch/x86/boot/a20.c
> @@ -53,24 +53,22 @@ static int empty_8042(void)
>
>  static int a20_test(int loops)
>  {
> -       int ok = 0;
>         int saved, ctr;
>
>         set_fs(0xffff);
>
>         saved = ctr = rdgs32(A20_TEST_ADDR);
>
> -       while (loops--) {
> +       do {
>                 wrgs32(++ctr, A20_TEST_ADDR);
>                 io_delay();     /* Serialize and make delay constant */
>                 barrier();      /* Compiler won't know about fs/gs overlap */
> -               ok = rdfs32(A20_TEST_ADDR+0x10) ^ ctr;
> -               if (ok)
> +               if (rdfs32(A20_TEST_ADDR+0x10) != ctr)
>                         break;
> -       }
> +       } while (--loops);
>
>         wrgs32(saved, A20_TEST_ADDR);
> -       return ok;
> +       return loops;
>  }
>
>  /* Quick test to see if A20 is already enabled */
> @@ -125,7 +123,7 @@ static void enable_a20_fast(void)
>
>  #define A20_ENABLE_LOOPS 255   /* Number of times to try */
>
> -int enable_a20(void)
> +void enable_a20(void)
>  {
>         int loops = A20_ENABLE_LOOPS;
>         int kbc_err;
> @@ -134,30 +132,30 @@ int enable_a20(void)
>                /* First, check to see if A20 is already enabled
>                   (legacy free, etc.) */
>                if (a20_test_short())
> -                      return 0;
> +                      return;
>
>                /* Next, try the BIOS (INT 0x15, AX=0x2401) */
>                enable_a20_bios();
>                if (a20_test_short())
> -                      return 0;
> +                      return;
>
>                /* Try enabling A20 through the keyboard controller */
>                kbc_err = empty_8042();
>
>                if (a20_test_short())
> -                      return 0; /* BIOS worked, but with delayed reaction */
> +                      return; /* BIOS worked, but with delayed reaction */
>
>                if (!kbc_err) {
>                        enable_a20_kbc();
>                        if (a20_test_long())
> -                              return 0;
> +                             return;
>                }
>
>                /* Finally, try enabling the "fast A20 gate" */
>                enable_a20_fast();
>                if (a20_test_long())
> -                      return 0;
> +                      return;
>         }
>
> -       return -1;
> +       die("A20 gate not responding, unable to boot...\n");
>  }
> diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
> index 4d3549ed7987..584c89d0738b 100644
> --- a/arch/x86/boot/boot.h
> +++ b/arch/x86/boot/boot.h
> @@ -167,7 +167,7 @@ void copy_to_fs(addr_t dst, void *src, size_t len);
>  void *copy_from_fs(void *dst, addr_t src, size_t len);
>
>  /* a20.c */
> -int enable_a20(void);
> +void enable_a20(void);
>
>  /* apm.c */
>  int query_apm_bios(void);
> diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
> index 3be89ba4b1b3..e39689ed65ea 100644
> --- a/arch/x86/boot/pm.c
> +++ b/arch/x86/boot/pm.c
> @@ -106,8 +106,7 @@ void go_to_protected_mode(void)
>         realmode_switch_hook();
>
>         /* Enable the A20 gate */
> -       if (enable_a20())
> -               die("A20 gate not responding, unable to boot...\n");
> +       enable_a20();
>
>         /* Reset coprocessor (IGNNE#) */
>         reset_coprocessor();
> --
> 2.52.0
>