From: Ard Biesheuvel <ardb@kernel.org>
The secondary startup code is used on the primary boot path as well, but
in this case, the initial part runs from a 1:1 mapping, until an
explicit cross-jump is made to the kernel virtual mapping of the same
code.
On the secondary boot path, this jump is pointless as the code already
executes from the mapping targeted by the jump. So combine this
cross-jump with the jump from startup_64() into the common boot path.
This simplifies the execution flow, and clearly separates code that runs
from a 1:1 mapping from code that runs from the kernel virtual mapping.
Note that this requires a page table switch, so hoist the CR3 assignment
into startup_64() as well. And since absolute symbol references will no
longer be permitted in .head.text once we enable the associated build
time checks, a RIP-relative memory operand is used in the JMP
instruction, referring to an absolute constant in the .init.rodata
section.
Given that the secondary startup code does not require a special
placement inside the executable, move it to the .text section.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/x86/kernel/head_64.S | 42 ++++++++++----------
1 file changed, 21 insertions(+), 21 deletions(-)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index c451a72bc92b..87929f615048 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -39,7 +39,6 @@ L4_START_KERNEL = l4_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
- .text
__HEAD
.code64
SYM_CODE_START_NOALIGN(startup_64)
@@ -126,9 +125,21 @@ SYM_CODE_START_NOALIGN(startup_64)
call sev_verify_cbit
#endif
- jmp 1f
+ /*
+ * Switch to early_top_pgt which still has the identity mappings
+ * present.
+ */
+ movq %rax, %cr3
+
+ /* Branch to the common startup code at its kernel virtual address */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *0f(%rip)
SYM_CODE_END(startup_64)
+ __INITRODATA
+0: .quad common_startup_64
+
+ .text
SYM_CODE_START(secondary_startup_64)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
@@ -174,8 +185,15 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
#ifdef CONFIG_AMD_MEM_ENCRYPT
addq sme_me_mask(%rip), %rax
#endif
+ /*
+ * Switch to the init_top_pgt here, away from the trampoline_pgd and
+ * unmap the identity mapped ranges.
+ */
+ movq %rax, %cr3
-1:
+SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
/*
* Create a mask of CR4 bits to preserve. Omit PGE in order to clean
@@ -199,30 +217,12 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
btsl $X86_CR4_PSE_BIT, %ecx
movq %rcx, %cr4
- /*
- * Switch to new page-table
- *
- * For the boot CPU this switches to early_top_pgt which still has the
- * identity mappings present. The secondary CPUs will switch to the
- * init_top_pgt here, away from the trampoline_pgd and unmap the
- * identity mapped ranges.
- */
- movq %rax, %cr3
-
/*
* Set CR4.PGE to re-enable global translations.
*/
btsl $X86_CR4_PGE_BIT, %ecx
movq %rcx, %cr4
- /* Ensure I am executing from virtual addresses */
- movq $1f, %rax
- ANNOTATE_RETPOLINE_SAFE
- jmp *%rax
-1:
- UNWIND_HINT_END_OF_STACK
- ANNOTATE_NOENDBR // above
-
#ifdef CONFIG_SMP
/*
* For parallel boot, the APIC ID is read from the APIC, and then
--
2.44.0.rc1.240.g4c46232300-goog
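For reference, a minimal standalone sketch (plain GAS, not taken from the
kernel sources; the old_style/new_style/virt_target labels and the plain
.rodata placement are illustrative only) of the two ways of reaching a label
at its link-time virtual address, contrasting the absolute jump the patch
removes with the RIP-relative one it adds:

	.text
	.code64

old_style:
	/*
	 * The target address is an absolute reference baked into the
	 * instruction stream; this is the kind of relocation that the
	 * upcoming .head.text build time checks will no longer permit.
	 */
	movq	$virt_target, %rax
	jmp	*%rax

new_style:
	/*
	 * The instruction only carries a RIP-relative displacement; the
	 * absolute virtual address lives in a data word elsewhere, not in
	 * the code. (The actual patch places that word in .init.rodata via
	 * __INITRODATA and annotates the indirect jump with
	 * ANNOTATE_RETPOLINE_SAFE.)
	 */
	jmp	*0f(%rip)

	.section .rodata
0:	.quad	virt_target		/* absolute VA, resolved at link time */

	.text
virt_target:
	ret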
First
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
for the patch.
On Tue, Feb 27, 2024 at 04:19:12PM +0100, Ard Biesheuvel wrote:
> + /*
> + * Switch to early_top_pgt which still has the identity mappings
> + * present.
I was wondering why we've had this "discrepancy" forever - the boot CPU
would have early_top_pgt *with* the ident mappings while the APs would do
init_top_pgt.
But we end up loading init_top_pgt on the BSP too in init_mem_mapping()
so there's only a short time during boot where we have this difference.
I haven't found a reason to have it yet except "why bother"...
And now some details just for future reference:
On the BSP:
=> 0x10000a0: mov %rax,%cr3
cr3 0x9922000
111850: ffffffff89922000 8192 OBJECT GLOBAL DEFAULT 22 early_top_pgt
(gdb) p/x early_top_pgt
$3 = {{pgd = 0x9924063}, {pgd = 0x9924063}, {pgd = 0x0} <repeats 509 times>, {pgd = 0x2418067}}
first two PGDs and the last one are populated.
On the AP:
cr3 0x2416000
104747: ffffffff82416000 8192 OBJECT GLOBAL DEFAULT 12 init_top_pgt
(gdb) p/x (long[512])*0xffffffff82416000
$8 = {0x0 <repeats 273 times>, 0xbe01067, 0x0 <repeats 128 times>, 0xc000067, 0xc001067, 0xc002067, 0xc003067, 0xc004067, 0xc005067,
0xc006067, 0xc007067, 0xc008067, 0xc009067, 0xc00a067, 0xc00b067, 0xc00c067, 0xc00d067, 0xc00e067, 0xc00f067, 0xc010067, 0xc011067,
0xc012067, 0xc013067, 0xc014067, 0xc015067, 0xc016067, 0xc017067, 0xc018067, 0xc019067, 0xc01a067, 0xc01b067, 0xc01c067, 0xc01d067,
0xc01e067, 0xc01f067, 0xc020067, 0xc021067, 0xc022067, 0xc023067, 0xc024067, 0xc025067, 0xc026067, 0xc027067, 0xc028067, 0xc029067,
0xc02a067, 0xc02b067, 0xc02c067, 0xc02d067, 0xc02e067, 0xc02f067, 0xc030067, 0xc031067, 0xc032067, 0xc033067, 0xc034067, 0xc035067,
0xc036067, 0xc037067, 0xc038067, 0xc039067, 0xc03a067, 0xc03b067, 0xc03c067, 0xc03d067, 0xc03e067, 0xc03f067, 0x0, 0x0, 0x7ffd3067,
0x0 <repeats 39 times>, 0x7ffd1067, 0x0, 0x9b11067, 0x2418067}
and that one becomes the swapper_pg_dir, which is the kernel page table we
use.
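As an aside, those slot numbers follow directly from 4-level paging
arithmetic: the PGD slot of a virtual address is (vaddr >> 39) & 0x1ff, so
the 1:1 mapping of low memory lands in slot 0, the default direct-map base
0xffff888000000000 (assuming no base randomization) in slot 273 - the lone
0xbe01067 entry above - and __START_KERNEL_map at 0xffffffff80000000 in
slot 511, the shared last entry (0x2418067) that shows up in both dumps.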
PTI then does two separate ones, which is a whole different topic.
:-)
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Thu, 29 Feb 2024 at 11:38, Borislav Petkov <bp@alien8.de> wrote:
>
> First
>
> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
>
> for the patch.
>
Thanks.
> On Tue, Feb 27, 2024 at 04:19:12PM +0100, Ard Biesheuvel wrote:
> > + /*
> > + * Switch to early_top_pgt which still has the identity mappings
> > + * present.
>
> I was wondering why we've had this "discrepancy" forever - the boot CPU
> would have early_top_pgt *with* the ident mappings while the APs would do
> init_top_pgt.
>
> But we end up loading init_top_pgt on the BSP too in init_mem_mapping()
> so there's only a short time during boot where we have this difference.
> I haven't found a reason to have it yet except "why bother"...
>
Because we enter with a 1:1 mapping, and so we can only switch to
another set of page tables that also includes this 1:1 mapping. Once
we are running from the kernel mapping, we can drop the 1:1 mapping,
but until then we still need it.
What we could do for robustness is reduce this 1:1 mapping to text +
rodata, and make it read-only, but I'm not sure it's worth the churn.
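Concretely: the mov %rax, %cr3 that the patch hoists into startup_64() still
executes from a 1:1 (physical) address, so whatever ends up in CR3 at that
point must keep translating that address, or the fetch of the very next
instruction faults. early_top_pgt keeps the 1:1 range alive until the
jmp *0f(%rip) has moved RIP into the kernel virtual mapping; only a CR3
loaded after that point (init_top_pgt, later on in init_mem_mapping()) can
get away without it.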
> And now some details just for future reference:
>
> On the BSP:
>
> => 0x10000a0: mov %rax,%cr3
>
> cr3 0x9922000
> 111850: ffffffff89922000 8192 OBJECT GLOBAL DEFAULT 22 early_top_pgt
>
> (gdb) p/x early_top_pgt
> $3 = {{pgd = 0x9924063}, {pgd = 0x9924063}, {pgd = 0x0} <repeats 509 times>, {pgd = 0x2418067}}
>
> first two PGDs and the last one are populated.
>
> On the AP:
>
> cr3 0x2416000
> 104747: ffffffff82416000 8192 OBJECT GLOBAL DEFAULT 12 init_top_pgt
>
> (gdb) p/x (long[512])*0xffffffff82416000
> $8 = {0x0 <repeats 273 times>, 0xbe01067, 0x0 <repeats 128 times>, 0xc000067, 0xc001067, 0xc002067, 0xc003067, 0xc004067, 0xc005067,
> 0xc006067, 0xc007067, 0xc008067, 0xc009067, 0xc00a067, 0xc00b067, 0xc00c067, 0xc00d067, 0xc00e067, 0xc00f067, 0xc010067, 0xc011067,
> 0xc012067, 0xc013067, 0xc014067, 0xc015067, 0xc016067, 0xc017067, 0xc018067, 0xc019067, 0xc01a067, 0xc01b067, 0xc01c067, 0xc01d067,
> 0xc01e067, 0xc01f067, 0xc020067, 0xc021067, 0xc022067, 0xc023067, 0xc024067, 0xc025067, 0xc026067, 0xc027067, 0xc028067, 0xc029067,
> 0xc02a067, 0xc02b067, 0xc02c067, 0xc02d067, 0xc02e067, 0xc02f067, 0xc030067, 0xc031067, 0xc032067, 0xc033067, 0xc034067, 0xc035067,
> 0xc036067, 0xc037067, 0xc038067, 0xc039067, 0xc03a067, 0xc03b067, 0xc03c067, 0xc03d067, 0xc03e067, 0xc03f067, 0x0, 0x0, 0x7ffd3067,
> 0x0 <repeats 39 times>, 0x7ffd1067, 0x0, 0x9b11067, 0x2418067}
>
> and that one becomes the swapper_pg_dir, which is the kernel page table we
> use.
>
> PTI then does two separate ones, which is a whole different topic.
>
> :-)
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
On Thu, Feb 29, 2024 at 11:36:01PM +0100, Ard Biesheuvel wrote:
> Because we enter with a 1:1 mapping, and so we can only switch to
> another set of page tables that also includes this 1:1 mapping. Once
> we are running from the kernel mapping, we can drop the 1:1 mapping,
> but until then we still need it.
>
> What we could do for robustness is reduce this 1:1 mapping to text +
> rodata, and make it read-only, but I'm not sure it's worth the churn.
Yeah, I was experimenting a bit with some shenanigans with those two
page tables yesterday and arrived at a similar conclusion - there's no
point in trying to unify them.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
The following commit has been merged into the x86/boot branch of tip:
Commit-ID: 828263957611c210da00c1820db73fac217135b6
Gitweb: https://git.kernel.org/tip/828263957611c210da00c1820db73fac217135b6
Author: Ard Biesheuvel <ardb@kernel.org>
AuthorDate: Tue, 27 Feb 2024 16:19:12 +01:00
Committer: Borislav Petkov (AMD) <bp@alien8.de>
CommitterDate: Mon, 04 Mar 2024 18:12:20 +01:00
x86/startup_64: Simplify virtual switch on primary boot
The secondary startup code is used on the primary boot path as well, but
in this case, the initial part runs from a 1:1 mapping, until an
explicit cross-jump is made to the kernel virtual mapping of the same
code.
On the secondary boot path, this jump is pointless as the code already
executes from the mapping targeted by the jump. So combine this
cross-jump with the jump from startup_64() into the common boot path.
This simplifies the execution flow, and clearly separates code that runs
from a 1:1 mapping from code that runs from the kernel virtual mapping.
Note that this requires a page table switch, so hoist the CR3 assignment
into startup_64() as well. And since absolute symbol references will no
longer be permitted in .head.text once we enable the associated build
time checks, a RIP-relative memory operand is used in the JMP
instruction, referring to an absolute constant in the .init.rodata
section.
Given that the secondary startup code does not require a special
placement inside the executable, move it to the .text section.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lore.kernel.org/r/20240227151907.387873-15-ardb+git@google.com
---
arch/x86/kernel/head_64.S | 42 +++++++++++++++++++-------------------
1 file changed, 21 insertions(+), 21 deletions(-)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b8b7118..79f7c34 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -39,7 +39,6 @@ L4_START_KERNEL = l4_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
- .text
__HEAD
.code64
SYM_CODE_START_NOALIGN(startup_64)
@@ -126,9 +125,21 @@ SYM_CODE_START_NOALIGN(startup_64)
call sev_verify_cbit
#endif
- jmp 1f
+ /*
+ * Switch to early_top_pgt which still has the identity mappings
+ * present.
+ */
+ movq %rax, %cr3
+
+ /* Branch to the common startup code at its kernel virtual address */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *0f(%rip)
SYM_CODE_END(startup_64)
+ __INITRODATA
+0: .quad common_startup_64
+
+ .text
SYM_CODE_START(secondary_startup_64)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
@@ -174,8 +185,15 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
#ifdef CONFIG_AMD_MEM_ENCRYPT
addq sme_me_mask(%rip), %rax
#endif
+ /*
+ * Switch to the init_top_pgt here, away from the trampoline_pgd and
+ * unmap the identity mapped ranges.
+ */
+ movq %rax, %cr3
-1:
+SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
/*
* Create a mask of CR4 bits to preserve. Omit PGE in order to flush
@@ -205,29 +223,11 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movq %rcx, %cr4
/*
- * Switch to new page-table
- *
- * For the boot CPU this switches to early_top_pgt which still has the
- * identity mappings present. The secondary CPUs will switch to the
- * init_top_pgt here, away from the trampoline_pgd and unmap the
- * identity mapped ranges.
- */
- movq %rax, %cr3
-
- /*
* Set CR4.PGE to re-enable global translations.
*/
btsl $X86_CR4_PGE_BIT, %ecx
movq %rcx, %cr4
- /* Ensure I am executing from virtual addresses */
- movq $1f, %rax
- ANNOTATE_RETPOLINE_SAFE
- jmp *%rax
-1:
- UNWIND_HINT_END_OF_STACK
- ANNOTATE_NOENDBR // above
-
#ifdef CONFIG_SMP
/*
* For parallel boot, the APIC ID is read from the APIC, and then